Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -14085,7 +14085,8 @@ SSID == SyncScope::System || SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"); - switch (RMW->getOperation()) { + auto Op = RMW->getOperation(); + switch (Op) { case AtomicRMWInst::FAdd: { Type *Ty = RMW->getType(); @@ -14160,18 +14161,26 @@ return AtomicExpansionKind::CmpXChg; } - case AtomicRMWInst::FMin: - case AtomicRMWInst::FMax: - case AtomicRMWInst::Min: + case AtomicRMWInst::Xchg: + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + case AtomicRMWInst::And: + case AtomicRMWInst::Or: + case AtomicRMWInst::Xor: case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: case AtomicRMWInst::UMin: - case AtomicRMWInst::UMax: { + case AtomicRMWInst::FMin: + case AtomicRMWInst::FMax: + case AtomicRMWInst::UIncWrap: + case AtomicRMWInst::UDecWrap: { if (AMDGPU::isFlatGlobalAddrSpace(AS)) { - if (RMW->getType()->isFloatTy() && + if (AtomicRMWInst::isFPOperation(Op) && unsafeFPAtomicsDisabled(RMW->getFunction())) return AtomicExpansionKind::CmpXChg; - // Always expand system scope min/max atomics. + // Always expand system scope atomics. if (HasSystemScope) return AtomicExpansionKind::CmpXChg; } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll @@ -477,77 +477,172 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; CI-NEXT: s_mov_b64 s[2:3], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s2, s2, 16 -; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_load_dword s0, s[6:7], 0x4 +; CI-NEXT: s_add_u32 s6, s6, 16 +; CI-NEXT: s_addc_u32 s7, s7, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: .LBB6_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v1, v0 +; CI-NEXT: v_subrev_i32_e32 v0, vcc, 1, v1 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; CI-NEXT: v_mov_b32_e32 v2, s6 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s7 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; CI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; CI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; CI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; CI-NEXT: s_cbranch_execnz .LBB6_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end +; CI-NEXT: s_or_b64 exec, exec, s[2:3] +; CI-NEXT: v_mov_b32_e32 v1, s4 +; CI-NEXT: v_mov_b32_e32 v2, s5 +; CI-NEXT: flat_store_dword v[1:2], v0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_ret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s2, 16 -; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_load_dword s0, s[6:7], 0x10 +; VI-NEXT: s_add_u32 s6, s6, 16 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: .LBB6_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; VI-NEXT: s_cbranch_execnz .LBB6_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_dec_ret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v2 +; GFX9-NEXT: v_subrev_u32_e32 v1, 1, v2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 42, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc +; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[6:7] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB6_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_dec_ret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_cmp_lt_u32_e64 s0, 42, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 1, v2 +; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 42, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc +; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[6:7] offset:16 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_dec_ret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: v_cmp_lt_u32_e64 s0, 42, v2 +; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 42, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[6:7] offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -698,67 +793,153 @@ ; CI-LABEL: global_atomic_dec_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_mov_b64 s[2:3], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 16 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_load_dword s6, s[0:1], 0x4 +; CI-NEXT: s_add_u32 s4, s0, 16 +; CI-NEXT: s_addc_u32 s5, s1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s6 +; CI-NEXT: .LBB9_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_subrev_i32_e32 v0, vcc, 1, v1 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_dec v[0:1], v2 +; CI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; CI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; CI-NEXT: v_mov_b32_e32 v1, v0 +; CI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; CI-NEXT: s_cbranch_execnz .LBB9_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_load_dword s6, s[0:1], 0x10 +; VI-NEXT: s_add_u32 s4, s0, 16 +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: .LBB9_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; VI-NEXT: s_cbranch_execnz .LBB9_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_dec_noret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x10 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; GFX9-NEXT: v_subrev_u32_e32 v0, 1, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v1, v0, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_dec_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x10 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_cmp_lt_u32_e64 s0, 42, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 1, v1 +; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 42, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_dec v1, v0, s[0:1] offset:16 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_dec_noret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cmp_lt_u32_e64 s0, 42, v1 +; GFX11-NEXT: v_subrev_nc_u32_e32 v0, 1, v1 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 42, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] offset:16 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4 @@ -1124,90 +1305,182 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s2, s2, 16 -; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: s_add_u32 s2, s6, 16 +; CI-NEXT: s_addc_u32 s3, s7, 0 ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: s_mov_b64 s[6:7], 0 +; CI-NEXT: .LBB14_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; CI-NEXT: v_mov_b32_e32 v1, v0 +; CI-NEXT: v_subrev_i32_e32 v0, vcc, 1, v1 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; CI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; CI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; CI-NEXT: s_cbranch_execnz .LBB14_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end +; CI-NEXT: s_or_b64 exec, exec, s[6:7] +; CI-NEXT: v_mov_b32_e32 v1, s4 +; CI-NEXT: v_mov_b32_e32 v2, s5 +; CI-NEXT: flat_store_dword v[1:2], v0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_ret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s2, 16 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: s_add_u32 s2, s6, 16 +; VI-NEXT: s_addc_u32 s3, s7, 0 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: .LBB14_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: s_cbranch_execnz .LBB14_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_subrev_u32_e32 v0, 1, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: flat_store_dword v[1:2], v0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s2, s2, 16 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_add_u32 s2, s6, 16 +; GFX10-NEXT: s_addc_u32 s3, s7, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: flat_load_dword v0, v[0:1] +; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_cmp_lt_u32_e64 s0, 42, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 1, v1 +; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 42, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:16 +; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: v_mov_b32_e32 v3, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cmp_lt_u32_e64 s0, 42, v1 +; GFX11-NEXT: v_subrev_nc_u32_e32 v0, 1, v1 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 42, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s5 +; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4 @@ -1374,77 +1647,162 @@ ; CI-LABEL: flat_atomic_dec_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_mov_b64 s[4:5], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 16 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_add_u32 s2, s0, 16 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dword v1, v[0:1] +; CI-NEXT: .LBB17_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_dec v[0:1], v2 +; CI-NEXT: v_subrev_i32_e32 v0, vcc, 1, v1 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; CI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CI-NEXT: v_mov_b32_e32 v1, v0 +; CI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; CI-NEXT: s_cbranch_execnz .LBB17_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: .LBB17_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB17_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_dec v[0:1], v2 offset:16 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; GFX9-NEXT: v_subrev_u32_e32 v0, 1, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s0, 16 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_add_u32 s2, s0, 16 +; GFX10-NEXT: s_addc_u32 s3, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: flat_load_dword v1, v[0:1] +; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_cmp_lt_u32_e64 s0, 42, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 1, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 42, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_dec v[0:1], v2 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_load_b32 v1, v[0:1] offset:16 +; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cmp_lt_u32_e64 s0, 42, v1 +; GFX11-NEXT: v_subrev_nc_u32_e32 v0, 1, v1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 42, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:16 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4 @@ -2053,82 +2411,188 @@ ; CI-LABEL: flat_atomic_dec_noret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b64 s[4:5], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 32 +; CI-NEXT: s_add_u32 s2, s0, 32 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: s_add_u32 s0, s0, 36 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v4, s1 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v3, s0 +; CI-NEXT: flat_load_dword v2, v[0:1] +; CI-NEXT: flat_load_dword v3, v[3:4] +; CI-NEXT: .LBB24_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; CI-NEXT: v_subrev_i32_e32 v0, vcc, 1, v2 +; CI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc +; CI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; CI-NEXT: v_cmp_lt_u64_e64 s[0:1], 42, v[2:3] +; CI-NEXT: v_mov_b32_e32 v5, s3 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; CI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; CI-NEXT: v_mov_b32_e32 v3, v1 +; CI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; CI-NEXT: s_cbranch_execnz .LBB24_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_noret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_add_u32 s0, s0, 36 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: flat_load_dword v3, v[3:4] +; VI-NEXT: .LBB24_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: v_subrev_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_u64_e64 s[0:1], 42, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB24_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[0:1], 42, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB24_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s0, 32 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_add_u32 s2, s0, 32 +; GFX10-NEXT: s_addc_u32 s3, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, 42, v[2:3] +; GFX10-NEXT: v_sub_co_u32 v0, s1, v2, 1 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s1, 0, v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 42, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] offset:32 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, 42, v[2:3] +; GFX11-NEXT: v_sub_co_u32 v0, s1, v2, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_subrev_co_ci_u32_e64 v1, s1, 0, v3, s1 +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 42, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8 @@ -2961,82 +3425,193 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; CI-NEXT: s_mov_b64 s[8:9], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s2, s2, 32 -; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; CI-NEXT: s_add_u32 s6, s6, 32 +; CI-NEXT: s_addc_u32 s7, s7, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: .LBB34_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v3, v1 +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; CI-NEXT: v_cmp_lt_u64_e64 s[0:1], 42, v[2:3] +; CI-NEXT: v_subrev_i32_e64 v0, s[2:3], 1, v2 +; CI-NEXT: v_mov_b32_e32 v4, s6 +; CI-NEXT: v_subbrev_u32_e64 v1, s[2:3], 0, v3, s[2:3] +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v5, s7 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; CI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; CI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; CI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; CI-NEXT: s_cbranch_execnz .LBB34_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end +; CI-NEXT: s_or_b64 exec, exec, s[8:9] +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_ret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s2, 32 -; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x20 +; VI-NEXT: s_add_u32 s6, s6, 32 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: .LBB34_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_u64_e64 s[0:1], 42, v[2:3] +; VI-NEXT: v_subrev_u32_e64 v0, s[2:3], 1, v2 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_subbrev_u32_e64 v1, s[2:3], 0, v3, s[2:3] +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB34_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[8:9] +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_dec_ret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x20 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[5:6] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[0:1], 42, v[5:6] +; GFX9-NEXT: v_subrev_co_u32_e64 v0, s[2:3], 1, v5 +; GFX9-NEXT: v_subbrev_co_u32_e64 v1, s[2:3], 0, v6, s[2:3] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v0, 42, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, 0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[6:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB34_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_dec_ret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x20 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[5:6] +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, 42, v[5:6] +; GFX10-NEXT: v_sub_co_u32 v0, s1, v5, 1 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s1, 0, v6, s1 +; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, 42, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[6:7] offset:32 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[5:6] +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz .LBB34_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_dec_ret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x20 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[5:6] +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, 42, v[5:6] +; GFX11-NEXT: v_sub_co_u32 v0, s1, v5, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_subrev_co_ci_u32_e64 v1, s1, 0, v6, s1 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v0, 42, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v4, v1, 0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[0:1], v2, v[3:6], s[6:7] offset:32 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[5:6] +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -3197,72 +3772,175 @@ ; CI-LABEL: global_atomic_dec_noret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b64 s[2:3], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 32 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x8 +; CI-NEXT: s_add_u32 s4, s0, 32 +; CI-NEXT: s_addc_u32 s5, s1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s6 +; CI-NEXT: v_mov_b32_e32 v3, s7 +; CI-NEXT: .LBB37_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_subrev_i32_e32 v0, vcc, 1, v2 +; CI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc +; CI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; CI-NEXT: v_cmp_lt_u64_e64 s[0:1], 42, v[2:3] +; CI-NEXT: v_mov_b32_e32 v4, s4 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v5, s5 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; CI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; CI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; CI-NEXT: v_mov_b32_e32 v3, v1 +; CI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; CI-NEXT: s_cbranch_execnz .LBB37_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_noret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 32 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 +; VI-NEXT: s_add_u32 s4, s0, 32 +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: .LBB37_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_u64_e64 s[0:1], 42, v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; VI-NEXT: s_cbranch_execnz .LBB37_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_dec_noret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[0:1], 42, v[2:3] +; GFX9-NEXT: v_subrev_co_u32_e64 v0, s[2:3], 1, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v1, s[2:3], 0, v3, s[2:3] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB37_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_dec_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x20 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, 42, v[2:3] +; GFX10-NEXT: v_sub_co_u32 v0, s1, v2, 1 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s1, 0, v3, s1 +; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 42, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] offset:32 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB37_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_dec_noret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x20 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, 42, v[2:3] +; GFX11-NEXT: v_sub_co_u32 v0, s1, v2, 1 +; GFX11-NEXT: v_subrev_co_ci_u32_e64 v1, s1, 0, v3, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 42, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] offset:32 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -478,76 +478,160 @@ ; CI-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_mov_b64 s[4:5], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_load_dword s6, s[2:3], 0x4 ; CI-NEXT: s_add_u32 s2, s2, 16 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s6 +; CI-NEXT: .LBB6_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v1, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; CI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; CI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; CI-NEXT: s_cbranch_execnz .LBB6_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end +; CI-NEXT: s_or_b64 exec, exec, s[4:5] +; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: flat_store_dword v[1:2], v0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s6, s[2:3], 0x10 ; VI-NEXT: s_add_u32 s2, s2, 16 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: .LBB6_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB6_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x10 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_add_u32_e32 v1, 1, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 42, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc +; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[2:3] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB6_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x10 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc +; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[2:3] offset:16 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 1, v2 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[2:3] offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -697,68 +781,143 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; CI-NEXT: s_mov_b64 s[0:1], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 16 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_load_dword s4, s[2:3], 0x4 +; CI-NEXT: s_add_u32 s2, s2, 16 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s4 +; CI-NEXT: .LBB9_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_inc v[0:1], v2 +; CI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v1, v0 +; CI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; CI-NEXT: s_cbranch_execnz .LBB9_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_noret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_load_dword s4, s[2:3], 0x10 +; VI-NEXT: s_add_u32 s2, s2, 16 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: .LBB9_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_cbranch_execnz .LBB9_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_u32_e32 v0, 1, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v1, v0, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x10 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_inc v1, v0, s[0:1] offset:16 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v1 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4 @@ -1528,17 +1687,35 @@ ; CI-LABEL: global_atomic_inc_ret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b64 s[4:5], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x8 ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s6 +; CI-NEXT: v_mov_b32_e32 v1, s7 +; CI-NEXT: .LBB19_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v3, v1 +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; CI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; CI-NEXT: v_mov_b32_e32 v5, s3 +; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; CI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; CI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; CI-NEXT: s_cbranch_execnz .LBB19_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end +; CI-NEXT: s_or_b64 exec, exec, s[4:5] ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1547,17 +1724,35 @@ ; VI-LABEL: global_atomic_inc_ret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x20 ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: .LBB19_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB19_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1566,42 +1761,105 @@ ; GFX9-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x20 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, 42, v[5:6] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v0, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[2:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB19_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v5, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v6, vcc_lo +; GFX10-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[5:6] +; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[2:3] offset:32 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[5:6] +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB19_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x20 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v5, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v6, vcc_lo +; GFX11-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[5:6] +; GFX11-NEXT: v_cndmask_b32_e64 v3, v0, 0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[0:1], v2, v[3:6], s[2:3] offset:32 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[5:6] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1762,73 +2020,163 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; CI-NEXT: s_mov_b64 s[0:1], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 32 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; CI-NEXT: s_add_u32 s2, s2, 32 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: .LBB22_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; CI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; CI-NEXT: v_mov_b32_e32 v5, s3 +; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; CI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; CI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; CI-NEXT: v_mov_b32_e32 v3, v1 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; CI-NEXT: s_cbranch_execnz .LBB22_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_noret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 32 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20 +; VI-NEXT: s_add_u32 s2, s2, 32 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: .LBB22_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_cbranch_execnz .LBB22_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB22_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x20 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x20 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, 1 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo +; GFX11-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8 @@ -2205,89 +2553,169 @@ ; CI-LABEL: flat_atomic_inc_ret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_mov_b64 s[4:5], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: .LBB27_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; CI-NEXT: v_mov_b32_e32 v1, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; CI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; CI-NEXT: s_cbranch_execnz .LBB27_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end +; CI-NEXT: s_or_b64 exec, exec, s[4:5] +; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: flat_store_dword v[1:2], v0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_ret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: .LBB27_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB27_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GFX9-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_add_u32_e32 v0, 1, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB27_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: flat_store_dword v[1:2], v0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 16 ; GFX10-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: flat_load_dword v0, v[0:1] +; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:16 +; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_add_nc_u32 v0, 1, v1 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4 @@ -2454,77 +2882,151 @@ ; CI-LABEL: flat_atomic_inc_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_mov_b64 s[2:3], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_load_dword v1, v[0:1] +; CI-NEXT: .LBB30_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_inc v[0:1], v2 +; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; CI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; CI-NEXT: v_mov_b32_e32 v1, v0 +; CI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; CI-NEXT: s_cbranch_execnz .LBB30_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: .LBB30_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; VI-NEXT: s_cbranch_execnz .LBB30_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc v[0:1], v2 offset:16 +; GFX9-NEXT: v_add_u32_e32 v0, 1, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB30_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 16 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: flat_load_dword v1, v[0:1] +; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_inc v[0:1], v2 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_load_b32 v1, v[0:1] offset:16 +; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_add_nc_u32 v0, 1, v1 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v1 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4 @@ -3066,63 +3568,127 @@ ; CI-LABEL: flat_atomic_inc_ret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s2, s2, 32 +; CI-NEXT: s_add_u32 s4, s2, 32 +; CI-NEXT: s_addc_u32 s5, s3, 0 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: s_add_u32 s2, s2, 36 +; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: flat_load_dword v1, v[2:3] +; CI-NEXT: s_mov_b64 s[2:3], 0 +; CI-NEXT: .LBB36_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: v_mov_b32_e32 v3, v1 +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; CI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; CI-NEXT: v_mov_b32_e32 v4, s4 +; CI-NEXT: v_mov_b32_e32 v5, s5 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; CI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; CI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; CI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; CI-NEXT: s_cbranch_execnz .LBB36_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end +; CI-NEXT: s_or_b64 exec, exec, s[2:3] ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_add_u32 s0, s0, 4 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: flat_store_dword v[2:3], v0 -; CI-NEXT: flat_store_dword v[4:5], v1 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dword v[2:3], v1 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s2, 32 +; VI-NEXT: s_add_u32 s4, s2, 32 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_add_u32 s2, s2, 36 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: s_mov_b64 s[2:3], 0 +; VI-NEXT: .LBB36_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; VI-NEXT: s_cbranch_execnz .LBB36_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_add_u32 s0, s0, 4 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dword v[2:3], v0 -; VI-NEXT: flat_store_dword v[4:5], v1 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dword v[2:3], v1 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB36_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -3131,19 +3697,37 @@ ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 32 ; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB36_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -3152,16 +3736,36 @@ ; GFX11-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, 1 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo +; GFX11-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm @@ -3339,83 +3943,177 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 32 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_add_u32 s0, s2, 32 +; CI-NEXT: s_addc_u32 s1, s3, 0 +; CI-NEXT: s_add_u32 s2, s2, 36 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v4, s3 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v3, s2 +; CI-NEXT: flat_load_dword v2, v[0:1] +; CI-NEXT: flat_load_dword v3, v[3:4] +; CI-NEXT: s_mov_b64 s[2:3], 0 +; CI-NEXT: .LBB39_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; CI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; CI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; CI-NEXT: v_mov_b32_e32 v3, v1 +; CI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; CI-NEXT: s_cbranch_execnz .LBB39_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_noret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 32 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_u32 s0, s2, 32 +; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s2, s2, 36 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: flat_load_dword v3, v[3:4] +; VI-NEXT: s_mov_b64 s[2:3], 0 +; VI-NEXT: .LBB39_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; VI-NEXT: s_cbranch_execnz .LBB39_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB39_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 32 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v5, s1 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz .LBB39_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] offset:32 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, 1 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo +; GFX11-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8 Index: llvm/test/CodeGen/AMDGPU/flat_atomics.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/flat_atomics.ll +++ llvm/test/CodeGen/AMDGPU/flat_atomics.ll @@ -4027,11 +4027,23 @@ ; GCN1-NEXT: s_addc_u32 s1, s3, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[2:3], 0 +; GCN1-NEXT: .LBB67_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN1-NEXT: s_cbranch_execnz .LBB67_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_f32_offset: @@ -4043,25 +4055,49 @@ ; GCN2-NEXT: s_addc_u32 s1, s3, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[2:3], 0 +; GCN2-NEXT: .LBB67_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN2-NEXT: s_cbranch_execnz .LBB67_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm ; ; GCN3-LABEL: atomic_xchg_f32_offset: ; GCN3: ; %bb.0: ; %entry ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_mov_b64 s[0:1], 0 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s2 ; GCN3-NEXT: v_mov_b32_e32 v1, s3 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: .LBB67_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s2 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN3-NEXT: s_cbranch_execnz .LBB67_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: %gep = getelementptr float, ptr %out, i32 4 Index: llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -11,25 +11,61 @@ ; GCN1-LABEL: flat_atomic_xchg_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap v[0:1], v2 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB0_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v3, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB0_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap v[0:1], v2 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB0_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v3, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB0_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap v[0:1], v2 +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB0_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v3, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB0_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr %ptr, i32 %in seq_cst ret void @@ -41,10 +77,21 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB1_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v3, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB1_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i32_noret_offset: @@ -52,18 +99,41 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB1_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v3, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB1_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB1_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v3, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB1_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst @@ -74,25 +144,67 @@ ; GCN1-LABEL: flat_atomic_xchg_i32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB2_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v4 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB2_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB2_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v4 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB2_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB2_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v4 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB2_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr %ptr, i32 %in seq_cst ret i32 %result @@ -102,31 +214,69 @@ ; GCN1-LABEL: flat_atomic_xchg_i32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB3_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB3_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB3_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN2-NEXT: v_mov_b32_e32 v3, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB3_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB3_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v4 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB3_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, i32 %in seq_cst @@ -139,11 +289,24 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB4_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB4_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i32_noret_scalar: @@ -151,11 +314,24 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB4_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB4_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i32_noret_scalar: @@ -163,11 +339,24 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB4_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap v[0:1], v2 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB4_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr %ptr, i32 %in seq_cst ret void @@ -181,11 +370,24 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB5_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB5_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i32_noret_offset_scalar: @@ -195,11 +397,24 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB5_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB5_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i32_noret_offset_scalar: @@ -207,11 +422,24 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB5_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB5_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst @@ -224,11 +452,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB6_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v3, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s6 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB6_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i32_ret_scalar: @@ -236,11 +478,25 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB6_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v3, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s6 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB6_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i32_ret_scalar: @@ -248,11 +504,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB6_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v3, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s6 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s5 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB6_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr %ptr, i32 %in seq_cst ret i32 %result @@ -266,11 +536,25 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB7_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s6 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s35 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB7_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i32_ret_offset_scalar: @@ -280,11 +564,25 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB7_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s6 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s35 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB7_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i32_ret_offset_scalar: @@ -292,11 +590,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB7_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v3, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s6 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB7_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, i32 %in seq_cst @@ -311,25 +623,61 @@ ; GCN1-LABEL: flat_atomic_xchg_f32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap v[0:1], v2 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB8_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v3, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB8_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap v[0:1], v2 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB8_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v3, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB8_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap v[0:1], v2 +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB8_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v3, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB8_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr %ptr, float %in seq_cst ret void @@ -341,10 +689,21 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB9_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v3, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB9_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f32_noret_offset: @@ -352,18 +711,41 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB9_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v3, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB9_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB9_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v3, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB9_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst @@ -374,25 +756,67 @@ ; GCN1-LABEL: flat_atomic_xchg_f32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB10_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v4 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB10_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB10_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v4 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB10_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB10_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v4 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB10_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr %ptr, float %in seq_cst ret float %result @@ -402,31 +826,69 @@ ; GCN1-LABEL: flat_atomic_xchg_f32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB11_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB11_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB11_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN2-NEXT: v_mov_b32_e32 v3, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB11_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB11_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v4 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB11_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, float %in seq_cst @@ -439,11 +901,24 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB12_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB12_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f32_noret_scalar: @@ -451,11 +926,24 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB12_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB12_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f32_noret_scalar: @@ -463,11 +951,24 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB12_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap v[0:1], v2 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB12_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr %ptr, float %in seq_cst ret void @@ -481,11 +982,24 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB13_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB13_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f32_noret_offset_scalar: @@ -495,11 +1009,24 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB13_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB13_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f32_noret_offset_scalar: @@ -507,11 +1034,24 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB13_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB13_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst @@ -524,11 +1064,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB14_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v3, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s6 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB14_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f32_ret_scalar: @@ -536,11 +1090,25 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB14_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v3, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s6 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB14_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f32_ret_scalar: @@ -548,11 +1116,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB14_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v3, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s6 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s5 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB14_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr %ptr, float %in seq_cst ret float %result @@ -566,11 +1148,25 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB15_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s6 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s35 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB15_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f32_ret_offset_scalar: @@ -580,11 +1176,25 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB15_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s6 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s35 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB15_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f32_ret_offset_scalar: @@ -592,11 +1202,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB15_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v3, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s6 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB15_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, float %in seq_cst @@ -611,25 +1235,67 @@ ; GCN1-LABEL: flat_atomic_add_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_add v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB16_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB16_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_add v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB16_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v3, vcc, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB16_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add v[0:1], v2 +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB16_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_u32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB16_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw add ptr %ptr, i32 %in seq_cst ret void @@ -641,10 +1307,23 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB17_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, v4, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_add v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB17_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i32_noret_offset: @@ -652,47 +1331,119 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB17_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_add v[0:1], v2 +; GCN2-NEXT: v_add_u32_e32 v3, vcc, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB17_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB17_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_u32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 - %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst - ret void +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB17_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i32, ptr %out, i32 4 + %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst + ret void } define i32 @flat_atomic_add_i32_ret(ptr %ptr, i32 %in) { ; GCN1-LABEL: flat_atomic_add_i32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_add v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB18_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: v_add_i32_e32 v3, vcc, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB18_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_add v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB18_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: v_add_u32_e32 v3, vcc, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB18_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB18_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_add_u32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB18_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw add ptr %ptr, i32 %in seq_cst ret i32 %result @@ -702,31 +1453,72 @@ ; GCN1-LABEL: flat_atomic_add_i32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB19_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_add v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, v1, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB19_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB19_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, v1, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_add v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB19_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB19_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_add_u32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB19_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw add ptr %gep, i32 %in seq_cst @@ -739,11 +1531,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB20_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_add v[0:1], v2 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB20_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i32_noret_scalar: @@ -751,11 +1557,25 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB20_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_add v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB20_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i32_noret_scalar: @@ -763,11 +1583,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB20_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_u32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add v[0:1], v2 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB20_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw add ptr %ptr, i32 %in seq_cst ret void @@ -781,11 +1615,25 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB21_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_add v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB21_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i32_noret_offset_scalar: @@ -795,11 +1643,25 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB21_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_add v[0:1], v2 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB21_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i32_noret_offset_scalar: @@ -807,11 +1669,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB21_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_u32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB21_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst @@ -824,11 +1700,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB22_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, s6, v1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_add v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB22_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i32_ret_scalar: @@ -836,11 +1726,25 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB22_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, s6, v1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_add v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB22_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i32_ret_scalar: @@ -848,11 +1752,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB22_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_add_u32_e32 v0, s6, v1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add v0, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB22_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw add ptr %ptr, i32 %in seq_cst ret i32 %result @@ -866,11 +1784,25 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB23_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, s6, v1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_add v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB23_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i32_ret_offset_scalar: @@ -880,11 +1812,25 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB23_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, s6, v1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_add v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB23_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i32_ret_offset_scalar: @@ -892,11 +1838,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB23_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_add_u32_e32 v0, s6, v1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB23_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw add ptr %gep, i32 %in seq_cst @@ -911,25 +1871,67 @@ ; GCN1-LABEL: flat_atomic_sub_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB24_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_setpc_b64 s[30:31] -; +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB24_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; ; GCN2-LABEL: flat_atomic_sub_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB24_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB24_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v[0:1], v2 +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB24_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB24_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr %ptr, i32 %in seq_cst ret void @@ -941,10 +1943,23 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB25_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v4, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB25_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_noret_offset: @@ -952,18 +1967,45 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB25_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub v[0:1], v2 +; GCN2-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB25_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB25_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB25_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst @@ -974,25 +2016,70 @@ ; GCN1-LABEL: flat_atomic_sub_i32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB26_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB26_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB26_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB26_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB26_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB26_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr %ptr, i32 %in seq_cst ret i32 %result @@ -1002,31 +2089,72 @@ ; GCN1-LABEL: flat_atomic_sub_i32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB27_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v1, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB27_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB27_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v1, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB27_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB27_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB27_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw sub ptr %gep, i32 %in seq_cst @@ -1039,11 +2167,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB28_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub v[0:1], v2 +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB28_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_noret_scalar: @@ -1051,11 +2193,25 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB28_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB28_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_noret_scalar: @@ -1063,11 +2219,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB28_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_subrev_u32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v[0:1], v2 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB28_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr %ptr, i32 %in seq_cst ret void @@ -1081,11 +2251,25 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB29_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB29_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_noret_offset_scalar: @@ -1095,11 +2279,25 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB29_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub v[0:1], v2 +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB29_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_noret_offset_scalar: @@ -1107,11 +2305,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB29_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_subrev_u32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB29_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst @@ -1124,11 +2336,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB30_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB30_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_ret_scalar: @@ -1136,11 +2362,25 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB30_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB30_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_ret_scalar: @@ -1148,11 +2388,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB30_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_subrev_u32_e32 v0, s6, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB30_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr %ptr, i32 %in seq_cst ret i32 %result @@ -1166,11 +2420,25 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB31_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB31_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_ret_offset_scalar: @@ -1180,11 +2448,25 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB31_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB31_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_ret_offset_scalar: @@ -1192,11 +2474,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB31_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_subrev_u32_e32 v0, s6, v1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB31_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw sub ptr %gep, i32 %in seq_cst @@ -1211,25 +2507,67 @@ ; GCN1-LABEL: flat_atomic_and_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB32_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB32_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB32_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB32_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v[0:1], v2 +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB32_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB32_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr %ptr, i32 %in seq_cst ret void @@ -1241,10 +2579,23 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB33_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and v[0:1], v2 +; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB33_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_noret_offset: @@ -1252,18 +2603,45 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB33_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and v[0:1], v2 +; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB33_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB33_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB33_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst @@ -1274,25 +2652,70 @@ ; GCN1-LABEL: flat_atomic_and_i32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB34_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB34_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB34_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB34_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB34_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB34_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr %ptr, i32 %in seq_cst ret i32 %result @@ -1302,31 +2725,72 @@ ; GCN1-LABEL: flat_atomic_and_i32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB35_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_and_b32_e32 v0, v1, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB35_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB35_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_and_b32_e32 v0, v1, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB35_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB35_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB35_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw and ptr %gep, i32 %in seq_cst @@ -1339,11 +2803,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB36_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and v[0:1], v2 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB36_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_noret_scalar: @@ -1351,11 +2829,25 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB36_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and v[0:1], v2 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB36_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_noret_scalar: @@ -1363,11 +2855,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB36_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v[0:1], v2 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB36_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr %ptr, i32 %in seq_cst ret void @@ -1381,11 +2887,25 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB37_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and v[0:1], v2 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB37_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_noret_offset_scalar: @@ -1395,11 +2915,25 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB37_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB37_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_noret_offset_scalar: @@ -1407,11 +2941,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB37_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB37_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst @@ -1424,11 +2972,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB38_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB38_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_ret_scalar: @@ -1436,11 +2998,25 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB38_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB38_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_ret_scalar: @@ -1448,11 +3024,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB38_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB38_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr %ptr, i32 %in seq_cst ret i32 %result @@ -1466,11 +3056,25 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB39_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB39_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_ret_offset_scalar: @@ -1480,11 +3084,25 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB39_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB39_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_ret_offset_scalar: @@ -1492,11 +3110,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB39_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB39_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw and ptr %gep, i32 %in seq_cst @@ -2171,25 +3803,67 @@ ; GCN1-LABEL: flat_atomic_or_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB48_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB48_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB48_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB48_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v[0:1], v2 +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB48_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB48_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr %ptr, i32 %in seq_cst ret void @@ -2201,10 +3875,23 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB49_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v3, v4, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB49_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_noret_offset: @@ -2212,18 +3899,45 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB49_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or v[0:1], v2 +; GCN2-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB49_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB49_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB49_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst @@ -2234,25 +3948,70 @@ ; GCN1-LABEL: flat_atomic_or_i32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB50_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB50_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB50_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr %ptr, i32 %in seq_cst ret i32 %result @@ -2262,31 +4021,72 @@ ; GCN1-LABEL: flat_atomic_or_i32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB51_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_or_b32_e32 v0, v1, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB51_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB51_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_or_b32_e32 v0, v1, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB51_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB51_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB51_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw or ptr %gep, i32 %in seq_cst @@ -2299,11 +4099,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB52_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or v[0:1], v2 +; GCN1-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB52_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_noret_scalar: @@ -2311,11 +4125,25 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB52_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB52_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_noret_scalar: @@ -2323,11 +4151,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB52_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v[0:1], v2 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB52_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr %ptr, i32 %in seq_cst ret void @@ -2341,11 +4183,25 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB53_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB53_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_noret_offset_scalar: @@ -2355,11 +4211,25 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB53_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or v[0:1], v2 +; GCN2-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB53_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_noret_offset_scalar: @@ -2367,11 +4237,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB53_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB53_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst @@ -2384,11 +4268,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB54_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_or_b32_e32 v0, s6, v1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB54_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_ret_scalar: @@ -2396,11 +4294,25 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB54_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_or_b32_e32 v0, s6, v1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB54_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_ret_scalar: @@ -2408,11 +4320,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB54_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_or_b32_e32 v0, s6, v1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB54_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr %ptr, i32 %in seq_cst ret i32 %result @@ -2426,11 +4352,25 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB55_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_ret_offset_scalar: @@ -2440,11 +4380,25 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_or_b32_e32 v0, s6, v1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB55_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_ret_offset_scalar: @@ -2452,11 +4406,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_or_b32_e32 v0, s6, v1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB55_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw or ptr %gep, i32 %in seq_cst @@ -2471,25 +4439,67 @@ ; GCN1-LABEL: flat_atomic_xor_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB56_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB56_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v[0:1], v2 +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB56_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst ret void @@ -2501,10 +4511,23 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB57_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, v4, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB57_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_noret_offset: @@ -2512,18 +4535,45 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB57_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB57_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB57_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst @@ -2534,25 +4584,70 @@ ; GCN1-LABEL: flat_atomic_xor_i32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB58_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB58_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB58_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i32 %in seq_cst ret i32 %result @@ -2562,31 +4657,72 @@ ; GCN1-LABEL: flat_atomic_xor_i32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_xor_b32_e32 v0, v1, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB59_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_xor_b32_e32 v0, v1, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB59_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB59_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw xor ptr %gep, i32 %in seq_cst @@ -2599,11 +4735,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB60_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB60_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_noret_scalar: @@ -2611,11 +4761,25 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB60_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB60_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_noret_scalar: @@ -2623,11 +4787,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB60_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v[0:1], v2 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB60_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst ret void @@ -2641,11 +4819,25 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB61_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB61_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_noret_offset_scalar: @@ -2655,11 +4847,25 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB61_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB61_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_noret_offset_scalar: @@ -2667,11 +4873,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB61_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB61_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst @@ -2684,11 +4904,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB62_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB62_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_ret_scalar: @@ -2696,11 +4930,25 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB62_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB62_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_ret_scalar: @@ -2708,11 +4956,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB62_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB62_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i32 %in seq_cst ret i32 %result @@ -2726,11 +4988,25 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB63_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB63_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_ret_offset_scalar: @@ -2740,11 +5016,25 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB63_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB63_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_ret_offset_scalar: @@ -2752,11 +5042,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB63_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB63_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw xor ptr %gep, i32 %in seq_cst @@ -6517,25 +8821,73 @@ ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB107_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 1, v4 +; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB107_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB107_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 1, v4 +; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB107_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v[0:1], v2 +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB107_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_u32_e32 v3, 1, v4 +; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB107_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst ret void @@ -6547,10 +8899,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB108_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 1, v4 +; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB108_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_offset: @@ -6558,18 +8925,49 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB108_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc v[0:1], v2 +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 1, v4 +; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB108_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB108_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_u32_e32 v3, 1, v4 +; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB108_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst @@ -6580,25 +8978,76 @@ ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB109_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 1, v4 +; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB109_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB109_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 1, v4 +; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB109_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB109_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_add_u32_e32 v3, 1, v4 +; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB109_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst ret i32 %result @@ -6608,31 +9057,78 @@ ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB110_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB110_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB110_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB110_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB110_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_add_u32_e32 v3, 1, v4 +; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB110_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst @@ -6645,11 +9141,27 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB111_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar: @@ -6657,11 +9169,27 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc v[0:1], v2 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB111_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar: @@ -6669,11 +9197,27 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_u32_e32 v0, 1, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v[0:1], v2 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB111_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst ret void @@ -6687,11 +9231,27 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB112_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar: @@ -6701,11 +9261,27 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB112_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar: @@ -6713,11 +9289,27 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16 +; GCN3-NEXT: v_add_u32_e32 v0, 1, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB112_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst @@ -6730,11 +9322,27 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB113_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar: @@ -6742,11 +9350,27 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB113_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar: @@ -6754,11 +9378,27 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_add_u32_e32 v0, 1, v1 +; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB113_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst ret i32 %result @@ -6772,11 +9412,27 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB114_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar: @@ -6786,11 +9442,27 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB114_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar: @@ -6798,11 +9470,27 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_add_u32_e32 v0, 1, v1 +; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB114_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst @@ -6817,25 +9505,79 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB115_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB115_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v[0:1], v2 +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN3-NEXT: v_add_u32_e32 v3, -1, v4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB115_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst ret void @@ -6847,10 +9589,27 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB116_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_offset: @@ -6858,18 +9617,53 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec v[0:1], v2 +; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB116_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN3-NEXT: v_add_u32_e32 v3, -1, v4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB116_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst @@ -6880,25 +9674,82 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB117_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB117_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN3-NEXT: v_add_u32_e32 v3, -1, v4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB117_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst ret i32 %result @@ -6908,31 +9759,84 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB118_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB118_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB118_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB118_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB118_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN3-NEXT: v_add_u32_e32 v3, -1, v4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB118_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst @@ -6945,11 +9849,30 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB119_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB119_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_scalar: @@ -6957,11 +9880,30 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB119_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec v[0:1], v2 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB119_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_scalar: @@ -6969,11 +9911,30 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB119_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v[0:1], v2 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN3-NEXT: v_add_u32_e32 v0, -1, v1 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB119_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst ret void @@ -6983,29 +9944,67 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 16 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: s_add_u32 s36, s4, 16 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[38:39], 0 +; GCN1-NEXT: .LBB120_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s36 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, s37 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_cbranch_execnz .LBB120_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 16 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: s_add_u32 s36, s4, 16 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[38:39], 0 +; GCN2-NEXT: .LBB120_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec v[0:1], v2 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s36 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, s37 +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_cbranch_execnz .LBB120_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar: @@ -7013,11 +10012,30 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB120_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN3-NEXT: v_add_u32_e32 v0, -1, v1 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB120_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst @@ -7030,11 +10048,30 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, -1, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB121_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_scalar: @@ -7042,11 +10079,30 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, -1, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB121_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_scalar: @@ -7054,11 +10110,30 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_add_u32_e32 v4, -1, v1 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB121_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst ret i32 %result @@ -7068,29 +10143,67 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 16 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: s_add_u32 s36, s4, 16 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[38:39], 0 +; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, -1, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s36 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, s37 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_cbranch_execnz .LBB122_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 16 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: s_add_u32 s36, s4, 16 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[38:39], 0 +; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, -1, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s36 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, s37 +; GCN2-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_cbranch_execnz .LBB122_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar: @@ -7098,11 +10211,30 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_add_u32_e32 v4, -1, v1 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB122_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst Index: llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -2796,33 +2796,59 @@ ; GCN1-LABEL: atomic_xchg_f64_offset: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB65_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB65_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_f64_offset: ; GCN2: ; %bb.0: ; %entry ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: .LBB65_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB65_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm entry: %gep = getelementptr double, ptr %out, i64 4 @@ -2834,33 +2860,59 @@ ; GCN1-LABEL: atomic_xchg_pointer_offset: ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b64 s[4:5], 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s0, s0, 32 ; GCN1-NEXT: s_addc_u32 s1, s1, 0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: .LBB66_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB66_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_pointer_offset: ; GCN2: ; %bb.0: ; %entry ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_mov_b64 s[4:5], 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s0, s0, 32 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: .LBB66_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB66_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN2-NEXT: s_endpgm entry: %gep = getelementptr ptr, ptr %out, i32 4 Index: llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -11,25 +11,70 @@ ; GCN1-LABEL: flat_atomic_xchg_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: flat_load_dword v5, v[5:6] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB0_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v6 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v5, v7 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB0_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: flat_load_dword v5, v[5:6] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB0_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v6 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v5, v7 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB0_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB0_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v6 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v5, v7 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB0_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst ret void @@ -39,31 +84,74 @@ ; GCN1-LABEL: flat_atomic_xchg_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v5, v[0:1] +; GCN1-NEXT: flat_load_dword v4, v[6:7] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB1_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[2:5] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GCN1-NEXT: v_mov_b32_e32 v5, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB1_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v6, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v5, v[0:1] +; GCN2-NEXT: flat_load_dword v4, v[6:7] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB1_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[2:5] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GCN2-NEXT: v_mov_b32_e32 v5, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB1_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB1_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v6 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v5, v7 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB1_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst @@ -74,25 +162,79 @@ ; GCN1-LABEL: flat_atomic_xchg_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB2_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v6 +; GCN1-NEXT: v_mov_b32_e32 v5, v7 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB2_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v6 +; GCN1-NEXT: v_mov_b32_e32 v1, v7 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB2_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v6 +; GCN2-NEXT: v_mov_b32_e32 v5, v7 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB2_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v6 +; GCN2-NEXT: v_mov_b32_e32 v1, v7 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB2_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v6 +; GCN3-NEXT: v_mov_b32_e32 v5, v7 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB2_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v6 +; GCN3-NEXT: v_mov_b32_e32 v1, v7 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr %ptr, i64 %in seq_cst ret i64 %result @@ -102,31 +244,79 @@ ; GCN1-LABEL: flat_atomic_xchg_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[6:7] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB3_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v5, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[2:5] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB3_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v6, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[6:7] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB3_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: v_mov_b32_e32 v5, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[2:5] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB3_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB3_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v6 +; GCN3-NEXT: v_mov_b32_e32 v5, v7 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB3_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v6 +; GCN3-NEXT: v_mov_b32_e32 v1, v7 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst @@ -137,40 +327,92 @@ ; GCN1-LABEL: flat_atomic_xchg_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: v_mov_b32_e32 v4, s35 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[3:4] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB4_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB4_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: v_mov_b32_e32 v4, s35 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[3:4] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB4_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB4_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB4_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB4_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst ret void @@ -182,14 +424,33 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB5_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB5_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset_scalar: @@ -197,27 +458,60 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB5_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB5_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB5_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB5_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst @@ -228,40 +522,95 @@ ; GCN1-LABEL: flat_atomic_xchg_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB6_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB6_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: flat_atomic_xchg_i64_ret_scalar: -; GCN3: ; %bb.0: +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB6_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB6_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_setpc_b64 s[30:31] +; +; GCN3-LABEL: flat_atomic_xchg_i64_ret_scalar: +; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB6_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB6_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr %ptr, i64 %in seq_cst ret i64 %result @@ -273,14 +622,34 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB7_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB7_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset_scalar: @@ -288,27 +657,62 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB7_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB7_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB7_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB7_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst @@ -323,25 +727,70 @@ ; GCN1-LABEL: flat_atomic_xchg_f64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: flat_load_dword v5, v[5:6] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB8_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v6 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v5, v7 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB8_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: flat_load_dword v5, v[5:6] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB8_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v6 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v5, v7 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB8_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB8_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v6 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v5, v7 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB8_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst ret void @@ -351,31 +800,74 @@ ; GCN1-LABEL: flat_atomic_xchg_f64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v5, v[0:1] +; GCN1-NEXT: flat_load_dword v4, v[6:7] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB9_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[2:5] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GCN1-NEXT: v_mov_b32_e32 v5, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB9_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v6, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v5, v[0:1] +; GCN2-NEXT: flat_load_dword v4, v[6:7] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB9_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[2:5] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GCN2-NEXT: v_mov_b32_e32 v5, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB9_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB9_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v6 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v5, v7 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB9_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst @@ -386,25 +878,79 @@ ; GCN1-LABEL: flat_atomic_xchg_f64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB10_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v6 +; GCN1-NEXT: v_mov_b32_e32 v5, v7 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB10_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v6 +; GCN1-NEXT: v_mov_b32_e32 v1, v7 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB10_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v6 +; GCN2-NEXT: v_mov_b32_e32 v5, v7 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB10_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v6 +; GCN2-NEXT: v_mov_b32_e32 v1, v7 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB10_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v6 +; GCN3-NEXT: v_mov_b32_e32 v5, v7 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB10_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v6 +; GCN3-NEXT: v_mov_b32_e32 v1, v7 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr %ptr, double %in seq_cst ret double %result @@ -414,31 +960,79 @@ ; GCN1-LABEL: flat_atomic_xchg_f64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[6:7] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB11_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v5, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[2:5] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB11_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v6, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[6:7] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB11_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: v_mov_b32_e32 v5, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[2:5] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB11_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB11_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v6 +; GCN3-NEXT: v_mov_b32_e32 v5, v7 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB11_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v6 +; GCN3-NEXT: v_mov_b32_e32 v1, v7 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, double %in seq_cst @@ -449,40 +1043,92 @@ ; GCN1-LABEL: flat_atomic_xchg_f64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: v_mov_b32_e32 v4, s35 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[3:4] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB12_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB12_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: v_mov_b32_e32 v4, s35 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[3:4] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB12_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB12_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB12_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB12_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst ret void @@ -494,14 +1140,33 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB13_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB13_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset_scalar: @@ -509,27 +1174,60 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB13_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB13_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB13_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB13_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst @@ -540,40 +1238,95 @@ ; GCN1-LABEL: flat_atomic_xchg_f64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB14_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB14_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB14_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB14_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB14_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB14_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr %ptr, double %in seq_cst ret double %result @@ -585,14 +1338,34 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB15_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_mov_b32_e32 v0, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB15_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xchg_f64_ret_offset_scalar: @@ -600,27 +1373,62 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB15_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_mov_b32_e32 v0, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB15_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB15_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: v_mov_b32_e32 v0, s6 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB15_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, double %in seq_cst @@ -635,25 +1443,79 @@ ; GCN1-LABEL: flat_atomic_add_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB16_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, v6, v2 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, v7, v3, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB16_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB16_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, v6, v2 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, v7, v3, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB16_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB16_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, v6, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v3, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB16_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst ret void @@ -663,31 +1525,83 @@ ; GCN1-LABEL: flat_atomic_add_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB17_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, v6, v2 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, v7, v3, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB17_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_load_dword v7, v[0:1] +; GCN2-NEXT: flat_load_dword v6, v[8:9] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB17_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, v6, v2 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, v7, v3, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB17_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB17_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, v6, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v3, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB17_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst @@ -698,25 +1612,85 @@ ; GCN1-LABEL: flat_atomic_add_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: flat_load_dword v5, v[5:6] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB18_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, v6, v2 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, v7, v3, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB18_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: flat_load_dword v5, v[5:6] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB18_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, v6, v2 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, v7, v3, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB18_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB18_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, v6, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v3, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB18_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw add ptr %ptr, i64 %in seq_cst ret i64 %result @@ -726,31 +1700,85 @@ ; GCN1-LABEL: flat_atomic_add_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB19_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_add_i32_e32 v6, vcc, v8, v2 +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, v9, v3, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB19_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB19_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_add_u32_e32 v6, vcc, v8, v2 +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, v9, v3, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB19_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB19_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, v6, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v3, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB19_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw add ptr %gep, i64 %in seq_cst @@ -761,40 +1789,98 @@ ; GCN1-LABEL: flat_atomic_add_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: v_mov_b32_e32 v4, s35 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[3:4] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB20_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN1-NEXT: v_add_i32_e32 v0, vcc, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB20_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: v_mov_b32_e32 v4, s35 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[3:4] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB20_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN2-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB20_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB20_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB20_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst ret void @@ -806,14 +1892,35 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB21_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN1-NEXT: v_add_i32_e32 v0, vcc, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB21_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_noret_offset_scalar: @@ -821,27 +1928,64 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB21_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GCN2-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB21_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB21_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB21_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst @@ -852,40 +1996,98 @@ ; GCN1-LABEL: flat_atomic_add_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB22_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB22_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB22_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB22_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB22_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB22_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw add ptr %ptr, i64 %in seq_cst ret i64 %result @@ -897,14 +2099,35 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB23_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB23_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_add_i64_ret_offset_scalar: @@ -912,27 +2135,64 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB23_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB23_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_add_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB23_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB23_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw add ptr %gep, i64 %in seq_cst @@ -947,25 +2207,79 @@ ; GCN1-LABEL: flat_atomic_sub_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB24_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; GCN1-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB24_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB24_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; GCN2-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB24_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB24_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB24_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst ret void @@ -975,31 +2289,83 @@ ; GCN1-LABEL: flat_atomic_sub_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB25_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; GCN1-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB25_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[0:1] +; GCN2-NEXT: flat_load_dword v6, v[8:9] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB25_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; GCN2-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB25_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB25_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB25_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst @@ -1010,27 +2376,87 @@ ; GCN1-LABEL: flat_atomic_sub_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: flat_load_dword v5, v[5:6] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB26_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; GCN1-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB26_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: flat_load_dword v5, v[5:6] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB26_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; GCN2-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB26_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB26_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw sub ptr %ptr, i64 %in seq_cst +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB26_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 +; GCN3-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw sub ptr %ptr, i64 %in seq_cst ret i64 %result } @@ -1038,31 +2464,85 @@ ; GCN1-LABEL: flat_atomic_sub_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB27_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_sub_i32_e32 v6, vcc, v8, v2 +; GCN1-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB27_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB27_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_sub_u32_e32 v6, vcc, v8, v2 +; GCN2-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB27_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB27_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB27_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst @@ -1073,40 +2553,98 @@ ; GCN1-LABEL: flat_atomic_sub_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: v_mov_b32_e32 v4, s35 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[3:4] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB28_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB28_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: v_mov_b32_e32 v4, s35 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[3:4] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB28_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB28_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB28_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB28_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst ret void @@ -1118,14 +2656,35 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB29_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB29_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_noret_offset_scalar: @@ -1133,27 +2692,64 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB29_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB29_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB29_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB29_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst @@ -1164,40 +2760,98 @@ ; GCN1-LABEL: flat_atomic_sub_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB30_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB30_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB30_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB30_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB30_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB30_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr %ptr, i64 %in seq_cst ret i64 %result @@ -1209,14 +2863,35 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB31_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB31_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_ret_offset_scalar: @@ -1224,27 +2899,64 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB31_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB31_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB31_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB31_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst @@ -1259,25 +2971,79 @@ ; GCN1-LABEL: flat_atomic_and_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB32_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB32_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB32_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB32_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB32_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB32_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst ret void @@ -1287,31 +3053,83 @@ ; GCN1-LABEL: flat_atomic_and_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB33_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB33_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[0:1] +; GCN2-NEXT: flat_load_dword v6, v[8:9] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB33_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB33_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB33_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB33_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst @@ -1322,25 +3140,85 @@ ; GCN1-LABEL: flat_atomic_and_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: flat_load_dword v5, v[5:6] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB34_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB34_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: flat_load_dword v5, v[5:6] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB34_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB34_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB34_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB34_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr %ptr, i64 %in seq_cst ret i64 %result @@ -1350,31 +3228,85 @@ ; GCN1-LABEL: flat_atomic_and_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB35_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_and_b32_e32 v7, v9, v3 +; GCN1-NEXT: v_and_b32_e32 v6, v8, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB35_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB35_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_and_b32_e32 v7, v9, v3 +; GCN2-NEXT: v_and_b32_e32 v6, v8, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB35_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB35_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB35_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst @@ -1385,40 +3317,95 @@ ; GCN1-LABEL: flat_atomic_and_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: flat_atomic_and_i64_noret_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: v_mov_b32_e32 v4, s35 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[3:4] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB36_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB36_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_setpc_b64 s[30:31] +; +; GCN2-LABEL: flat_atomic_and_i64_noret_scalar: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: v_mov_b32_e32 v4, s35 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[3:4] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB36_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB36_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB36_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN3-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB36_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst ret void @@ -1430,14 +3417,34 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB37_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN1-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB37_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_noret_offset_scalar: @@ -1445,27 +3452,62 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB37_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN2-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB37_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB37_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB37_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst @@ -1476,40 +3518,95 @@ ; GCN1-LABEL: flat_atomic_and_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB38_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB38_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB38_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB38_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB38_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB38_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr %ptr, i64 %in seq_cst ret i64 %result @@ -1521,14 +3618,34 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB39_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB39_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_ret_offset_scalar: @@ -1536,27 +3653,62 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB39_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB39_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB39_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB39_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst @@ -2371,25 +4523,79 @@ ; GCN1-LABEL: flat_atomic_or_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB48_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB48_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB48_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB48_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB48_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB48_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst ret void @@ -2399,31 +4605,83 @@ ; GCN1-LABEL: flat_atomic_or_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB49_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB49_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[0:1] +; GCN2-NEXT: flat_load_dword v6, v[8:9] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB49_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB49_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB49_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB49_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst @@ -2434,25 +4692,85 @@ ; GCN1-LABEL: flat_atomic_or_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: flat_load_dword v5, v[5:6] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB50_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: flat_load_dword v5, v[5:6] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB50_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB50_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr %ptr, i64 %in seq_cst ret i64 %result @@ -2462,31 +4780,85 @@ ; GCN1-LABEL: flat_atomic_or_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB51_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_or_b32_e32 v7, v9, v3 +; GCN1-NEXT: v_or_b32_e32 v6, v8, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB51_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB51_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_or_b32_e32 v7, v9, v3 +; GCN2-NEXT: v_or_b32_e32 v6, v8, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB51_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB51_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB51_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst @@ -2497,40 +4869,95 @@ ; GCN1-LABEL: flat_atomic_or_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: v_mov_b32_e32 v4, s35 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[3:4] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB52_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB52_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: v_mov_b32_e32 v4, s35 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[3:4] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB52_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB52_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB52_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GCN3-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB52_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst ret void @@ -2542,14 +4969,34 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB53_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB53_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_noret_offset_scalar: @@ -2557,27 +5004,62 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB53_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB53_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB53_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB53_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst @@ -2588,40 +5070,95 @@ ; GCN1-LABEL: flat_atomic_or_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB54_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_or_b32_e32 v0, s6, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB54_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB54_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_or_b32_e32 v0, s6, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB54_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB54_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_or_b32_e32 v0, s6, v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB54_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr %ptr, i64 %in seq_cst ret i64 %result @@ -2633,14 +5170,34 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB55_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_ret_offset_scalar: @@ -2648,27 +5205,62 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_or_b32_e32 v0, s6, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB55_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_or_b32_e32 v0, s6, v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB55_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst @@ -2683,25 +5275,79 @@ ; GCN1-LABEL: flat_atomic_xor_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB56_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB56_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB56_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst ret void @@ -2711,31 +5357,83 @@ ; GCN1-LABEL: flat_atomic_xor_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB57_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB57_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[0:1] +; GCN2-NEXT: flat_load_dword v6, v[8:9] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB57_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB57_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB57_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst @@ -2746,25 +5444,85 @@ ; GCN1-LABEL: flat_atomic_xor_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: flat_load_dword v5, v[5:6] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB58_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: flat_load_dword v5, v[5:6] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB58_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB58_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i64 %in seq_cst ret i64 %result @@ -2774,31 +5532,85 @@ ; GCN1-LABEL: flat_atomic_xor_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_xor_b32_e32 v7, v9, v3 +; GCN1-NEXT: v_xor_b32_e32 v6, v8, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB59_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_xor_b32_e32 v7, v9, v3 +; GCN2-NEXT: v_xor_b32_e32 v6, v8, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB59_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB59_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst @@ -2809,40 +5621,95 @@ ; GCN1-LABEL: flat_atomic_xor_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: v_mov_b32_e32 v4, s35 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[3:4] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB60_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB60_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: v_mov_b32_e32 v4, s35 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[3:4] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB60_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB60_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB60_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN3-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB60_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst ret void @@ -2854,14 +5721,34 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB61_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB61_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_noret_offset_scalar: @@ -2869,27 +5756,62 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB61_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB61_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB61_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB61_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst @@ -2900,40 +5822,95 @@ ; GCN1-LABEL: flat_atomic_xor_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB62_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB62_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB62_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB62_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB62_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB62_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i64 %in seq_cst ret i64 %result @@ -2945,14 +5922,34 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB63_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB63_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_ret_offset_scalar: @@ -2960,27 +5957,62 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB63_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB63_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB63_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB63_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst @@ -7493,25 +10525,88 @@ ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB107_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB107_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB107_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB107_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB107_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB107_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst ret void @@ -7521,31 +10616,92 @@ ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB108_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v6 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB108_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[0:1] +; GCN2-NEXT: flat_load_dword v6, v[8:9] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB108_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v6 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB108_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB108_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB108_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst @@ -7556,25 +10712,94 @@ ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: flat_load_dword v5, v[5:6] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB109_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB109_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: flat_load_dword v5, v[5:6] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB109_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB109_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB109_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB109_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst ret i64 %result @@ -7584,31 +10809,94 @@ ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB110_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v8 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB110_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB110_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v8 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB110_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB110_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB110_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst @@ -7619,40 +10907,104 @@ ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: v_mov_b32_e32 v4, s35 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[3:4] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB111_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: v_mov_b32_e32 v4, s35 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[3:4] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB111_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB111_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst ret void @@ -7664,14 +11016,37 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB112_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: @@ -7679,27 +11054,68 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB112_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB112_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst @@ -7710,40 +11126,104 @@ ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB113_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB113_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB113_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst ret i64 %result @@ -7755,14 +11235,37 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB114_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: @@ -7770,27 +11273,68 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB114_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB114_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst @@ -7805,25 +11349,94 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: s_mov_b64 s[8:9], 0 +; GCN1-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6 +; GCN1-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_cbranch_execnz .LBB115_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: s_mov_b64 s[8:9], 0 +; GCN2-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; GCN2-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB115_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[8:9], 0 +; GCN3-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GCN3-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_cbranch_execnz .LBB115_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst ret void @@ -7833,31 +11446,98 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] +; GCN1-NEXT: s_mov_b64 s[8:9], 0 +; GCN1-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v0, s[6:7], -1, v6 +; GCN1-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v7, s[6:7] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v1, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_cbranch_execnz .LBB116_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[0:1] +; GCN2-NEXT: flat_load_dword v6, v[8:9] +; GCN2-NEXT: s_mov_b64 s[8:9], 0 +; GCN2-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v0, s[6:7], -1, v6 +; GCN2-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v7, s[6:7] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v1, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB116_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[8:9], 0 +; GCN3-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GCN3-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_cbranch_execnz .LBB116_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst @@ -7868,25 +11548,100 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: flat_load_dword v5, v[5:6] +; GCN1-NEXT: s_mov_b64 s[8:9], 0 +; GCN1-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6 +; GCN1-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_cbranch_execnz .LBB117_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: flat_load_dword v5, v[5:6] +; GCN2-NEXT: s_mov_b64 s[8:9], 0 +; GCN2-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; GCN2-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB117_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN3-NEXT: s_mov_b64 s[8:9], 0 +; GCN3-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GCN3-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_cbranch_execnz .LBB117_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst ret i64 %result @@ -7896,31 +11651,100 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[8:9], 0 +; GCN1-NEXT: .LBB118_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v0, s[6:7], -1, v8 +; GCN1-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_cbranch_execnz .LBB118_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[8:9], 0 +; GCN2-NEXT: .LBB118_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v0, s[6:7], -1, v8 +; GCN2-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB118_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[8:9], 0 +; GCN3-NEXT: .LBB118_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GCN3-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_cbranch_execnz .LBB118_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst @@ -7931,40 +11755,116 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: v_mov_b32_e32 v4, s35 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[3:4] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB119_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB119_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: v_mov_b32_e32 v4, s35 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[3:4] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB119_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: v_mov_b32_e32 v7, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB119_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB119_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, -1, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: v_mov_b32_e32 v7, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB119_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst ret void @@ -7974,44 +11874,120 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 32 +; GCN1-NEXT: s_add_u32 s36, s4, 32 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: s_add_u32 s34, s4, 36 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v4, s36 +; GCN1-NEXT: v_mov_b32_e32 v5, s37 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[38:39], 0 +; GCN1-NEXT: .LBB120_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s36 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v5, s37 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_cbranch_execnz .LBB120_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 32 +; GCN2-NEXT: s_add_u32 s36, s4, 32 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: s_add_u32 s34, s4, 36 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v4, s36 +; GCN2-NEXT: v_mov_b32_e32 v5, s37 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[38:39], 0 +; GCN2-NEXT: .LBB120_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: v_mov_b32_e32 v7, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s36 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v5, s37 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_cbranch_execnz .LBB120_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB120_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, -1, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: v_mov_b32_e32 v7, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB120_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst @@ -8022,40 +11998,116 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[2:3] +; GCN1-NEXT: s_mov_b64 s[38:39], 0 +; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v7, s[36:37], -1, v2 +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_cbranch_execnz .LBB121_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[2:3] +; GCN2-NEXT: s_mov_b64 s[38:39], 0 +; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v7, s[36:37], -1, v2 +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_cbranch_execnz .LBB121_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: s_mov_b64 s[38:39], 0 +; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v7, s[36:37], -1, v2 +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN3-NEXT: s_cbranch_execnz .LBB121_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst ret i64 %result @@ -8065,44 +12117,120 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 32 +; GCN1-NEXT: s_add_u32 s38, s4, 32 +; GCN1-NEXT: s_addc_u32 s39, s5, 0 +; GCN1-NEXT: s_add_u32 s34, s4, 36 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s38 +; GCN1-NEXT: v_mov_b32_e32 v3, s39 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[40:41], 0 +; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v7, s[36:37], -1, v2 +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s38 +; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v5, s39 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN1-NEXT: s_cbranch_execnz .LBB122_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[40:41] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 32 +; GCN2-NEXT: s_add_u32 s38, s4, 32 +; GCN2-NEXT: s_addc_u32 s39, s5, 0 +; GCN2-NEXT: s_add_u32 s34, s4, 36 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s38 +; GCN2-NEXT: v_mov_b32_e32 v3, s39 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[40:41], 0 +; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v7, s[36:37], -1, v2 +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s38 +; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v5, s39 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN2-NEXT: s_cbranch_execnz .LBB122_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[40:41] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[38:39], 0 +; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v7, s[36:37], -1, v2 +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN3-NEXT: s_cbranch_execnz .LBB122_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst Index: llvm/test/CodeGen/AMDGPU/global_atomics.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -3877,43 +3877,86 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float %in) { ; SI-LABEL: atomic_xchg_f32_offset: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_load_dword s3, s[4:5], 0x4 +; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: .LBB68_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SI-NEXT: s_cbranch_execnz .LBB68_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_endpgm ; ; VI-LABEL: atomic_xchg_f32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; VI-NEXT: s_load_dword s6, s[0:1], 0x2c +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_load_dword s2, s[8:9], 0x10 +; VI-NEXT: s_add_u32 s0, s8, 16 +; VI-NEXT: s_addc_u32 s1, s9, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: .LBB68_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB68_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xchg_f32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_load_dword s5, s[2:3], 0x10 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] offset:16 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB68_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr float, ptr addrspace(1) %out, i64 4 Index: llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -15,27 +15,65 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB0_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB0_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xchg_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap v[0:1], v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB0_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v3, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB0_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v[0:1], v2, off +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -49,10 +87,24 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB1_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB1_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -61,18 +113,41 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB1_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v3, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB1_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_i32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB1_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst @@ -87,28 +162,71 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB2_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB2_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xchg_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB2_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v4 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB2_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_i32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB2_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -122,31 +240,72 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB3_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB3_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xchg_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB3_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB3_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB3_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst @@ -166,11 +325,25 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB4_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB4_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -184,22 +357,46 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB4_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB4_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB4_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -218,11 +415,25 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB5_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB5_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -238,22 +449,46 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB5_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB5_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v0, v1, s[4:5] offset:16 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB5_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst @@ -273,11 +508,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB6_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB6_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -291,22 +542,48 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB6_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB6_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB6_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -325,11 +602,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB7_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB7_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -345,22 +638,48 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB7_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v3, s34 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB7_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst @@ -438,27 +757,65 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB8_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB8_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xchg_f32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap v[0:1], v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB8_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v3, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB8_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_f32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v[0:1], v2, off +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB8_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst ret void @@ -535,10 +892,24 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB9_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB9_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -547,18 +918,41 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB9_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v3, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB9_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_f32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst @@ -638,28 +1032,71 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB10_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB10_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xchg_f32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB10_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v4 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v4, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB10_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_f32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB10_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst ret float %result @@ -740,31 +1177,72 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB11_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB11_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xchg_f32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB11_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB11_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_f32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB11_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %out, i32 4 %result = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst @@ -858,11 +1336,25 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB12_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB12_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -876,22 +1368,46 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB12_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB12_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_f32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB12_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst ret void @@ -988,11 +1504,25 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB13_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB13_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1008,22 +1538,46 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB13_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB13_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_f32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v0, v1, s[4:5] offset:16 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB13_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst @@ -1120,11 +1674,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB14_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB14_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1138,22 +1708,48 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB14_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB14_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_f32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst ret float %result @@ -1253,11 +1849,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB15_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB15_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1273,22 +1885,48 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB15_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v3, s34 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB15_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_f32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[4:5] offset:16 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB15_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(1) %out, i32 4 %result = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst @@ -1307,27 +1945,71 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB16_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB16_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_add_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_add v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB16_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, v4, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB16_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_add_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add v[0:1], v2, off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB16_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -1341,10 +2023,26 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB17_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB17_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1353,18 +2051,45 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB17_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, v4, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_add v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB17_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_add_i32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst @@ -1379,28 +2104,74 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB18_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB18_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_add_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB18_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, v4, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB18_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_add_i32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_add_u32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB18_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -1414,31 +2185,75 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB19_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB19_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_add_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB19_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB19_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_add_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_add_u32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB19_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst @@ -1458,11 +2273,26 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB20_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_add v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB20_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1476,22 +2306,48 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB20_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_add v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB20_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_add_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, s6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add v0, v1, s[4:5] +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB20_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -1510,11 +2366,26 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB21_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_add v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB21_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1530,22 +2401,48 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB21_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_add v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB21_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_add_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, s6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add v0, v1, s[4:5] offset:16 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB21_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst @@ -1565,11 +2462,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB22_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, s34, v4 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB22_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1583,22 +2496,48 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB22_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB22_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_add_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_add_u32_e32 v2, s6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add v0, v0, v1, s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB22_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -1617,11 +2556,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB23_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, s34, v4 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB23_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1637,22 +2592,48 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB23_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB23_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_add_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_add_u32_e32 v2, s6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB23_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst @@ -1671,27 +2652,71 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB24_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_i32_e32 v3, vcc, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB24_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB24_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB24_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v[0:1], v2, off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB24_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -1705,10 +2730,26 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB25_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_i32_e32 v3, vcc, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB25_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1717,18 +2758,45 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB25_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB25_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB25_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst @@ -1743,28 +2811,74 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB26_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_sub_i32_e32 v4, vcc, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB26_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB26_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB26_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB26_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -1778,31 +2892,75 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB27_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_sub_i32_e32 v4, vcc, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB27_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB27_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_sub_u32_e32 v0, vcc, v1, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB27_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB27_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst @@ -1822,11 +2980,26 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB28_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_subrev_i32_e32 v1, vcc, s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB28_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1840,22 +3013,48 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB28_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB28_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v0, v1, s[4:5] +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB28_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -1874,11 +3073,26 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB29_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_subrev_i32_e32 v1, vcc, s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB29_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1894,22 +3108,48 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB29_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB29_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v0, v1, s[4:5] offset:16 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB29_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst @@ -1929,11 +3169,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB30_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_subrev_i32_e32 v3, vcc, s34, v4 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB30_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1947,22 +3203,48 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB30_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB30_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB30_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -1981,11 +3263,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB31_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_subrev_i32_e32 v3, vcc, s34, v4 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB31_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2001,22 +3299,48 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB31_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB31_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB31_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst @@ -2035,27 +3359,71 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB32_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB32_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB32_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v3, v4, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB32_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v[0:1], v2, off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB32_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -2069,10 +3437,26 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB33_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB33_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2081,18 +3465,45 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB33_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v3, v4, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB33_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB33_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst @@ -2107,28 +3518,74 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB34_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB34_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB34_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_and_b32_e32 v3, v4, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB34_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB34_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -2142,31 +3599,75 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB35_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB35_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB35_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_and_b32_e32 v0, v1, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB35_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB35_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst @@ -2186,11 +3687,26 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB36_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB36_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2204,22 +3720,48 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB36_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB36_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, s6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v0, v1, s[4:5] +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB36_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -2238,11 +3780,26 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB37_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB37_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2258,22 +3815,48 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB37_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB37_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, s6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v0, v1, s[4:5] offset:16 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB37_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst @@ -2293,11 +3876,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB38_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, s34, v4 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB38_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2311,22 +3910,48 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB38_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_and_b32_e32 v0, s6, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB38_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_and_b32_e32 v2, s6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v0, v0, v1, s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB38_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -2345,11 +3970,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB39_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, s34, v4 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB39_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2365,22 +4006,48 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB39_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_and_b32_e32 v0, s6, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB39_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_and_b32_e32 v2, s6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB39_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst @@ -3130,27 +4797,71 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB48_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB48_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_or_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB48_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v3, v4, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB48_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v[0:1], v2, off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB48_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -3164,10 +4875,26 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB49_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB49_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3176,18 +4903,45 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB49_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v3, v4, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB49_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB49_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst @@ -3202,28 +4956,74 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB50_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB50_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_or_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB50_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_or_b32_e32 v3, v4, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB50_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB50_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -3237,31 +5037,75 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB51_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB51_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_or_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB51_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_or_b32_e32 v0, v1, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB51_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB51_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst @@ -3281,11 +5125,26 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB52_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB52_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3299,22 +5158,48 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB52_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB52_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v0, s6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v0, v1, s[4:5] +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB52_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -3333,11 +5218,26 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB53_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB53_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3353,22 +5253,48 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB53_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB53_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v0, s6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v0, v1, s[4:5] offset:16 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB53_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst @@ -3388,11 +5314,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB54_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v3, s34, v4 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB54_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3406,22 +5348,48 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB54_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_or_b32_e32 v0, s6, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB54_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_or_b32_e32 v2, s6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v0, v0, v1, s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB54_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -3440,11 +5408,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB55_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v3, s34, v4 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB55_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3460,22 +5444,48 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB55_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_or_b32_e32 v0, s6, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB55_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_or_b32_e32 v2, s6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB55_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst @@ -3494,27 +5504,71 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB56_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB56_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB56_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v3, v4, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB56_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v[0:1], v2, off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB56_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -3528,10 +5582,26 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB57_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB57_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3540,18 +5610,45 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB57_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v3, v4, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB57_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB57_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst @@ -3566,28 +5663,74 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB58_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_xor_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB58_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB58_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_xor_b32_e32 v3, v4, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB58_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB58_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -3601,31 +5744,75 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB59_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_xor_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB59_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB59_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_xor_b32_e32 v0, v1, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB59_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB59_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst @@ -3645,11 +5832,26 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB60_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v1, s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB60_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3663,22 +5865,48 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB60_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB60_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v0, v1, s[4:5] +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB60_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -3697,11 +5925,26 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB61_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v1, s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB61_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3717,22 +5960,48 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB61_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB61_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v0, v1, s[4:5] offset:16 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB61_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst @@ -3752,11 +6021,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB62_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_xor_b32_e32 v3, s34, v4 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB62_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3770,22 +6055,48 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB62_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_xor_b32_e32 v0, s6, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB62_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_xor_b32_e32 v2, s6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB62_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -3804,11 +6115,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB63_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_xor_b32_e32 v3, s34, v4 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB63_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3824,22 +6151,48 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB63_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_xor_b32_e32 v0, s6, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB63_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_xor_b32_e32 v2, s6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB63_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst @@ -7871,27 +10224,77 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB107_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v4 +; SI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; SI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB107_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB107_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v4 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; VI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB107_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v[0:1], v2, off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 1, v4 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB107_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -7905,10 +10308,28 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB108_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v4 +; SI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; SI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB108_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7917,18 +10338,49 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB108_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v4 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; VI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB108_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 1, v4 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB108_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -7943,28 +10395,80 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB109_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v5 +; SI-NEXT: v_cmp_lt_u32_e32 vcc, v5, v2 +; SI-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB109_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB109_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v4 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; VI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB109_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB109_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 1, v4 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB109_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -7978,31 +10482,81 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB110_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v5 +; SI-NEXT: v_cmp_lt_u32_e32 vcc, v5, v2 +; SI-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB110_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB110_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB110_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB110_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 1, v4 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB110_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -8022,11 +10576,28 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB111_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v2 +; SI-NEXT: v_cmp_gt_u32_e32 vcc, s34, v2 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB111_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8040,22 +10611,52 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB111_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB111_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 1, v1 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v0, v1, s[4:5] +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB111_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -8074,11 +10675,28 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB112_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v2 +; SI-NEXT: v_cmp_gt_u32_e32 vcc, s34, v2 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB112_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8094,22 +10712,52 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB112_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB112_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 1, v1 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v0, v1, s[4:5] offset:16 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB112_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -8129,11 +10777,29 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB113_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v4 +; SI-NEXT: v_cmp_gt_u32_e32 vcc, s34, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB113_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8147,22 +10813,52 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB113_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB113_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 1, v3 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v0, v0, v1, s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB113_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -8181,11 +10877,29 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB114_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v4 +; SI-NEXT: v_cmp_gt_u32_e32 vcc, s34, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB114_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8201,22 +10915,52 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB114_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB114_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 1, v3 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB114_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -8231,31 +10975,87 @@ ; SI-LABEL: global_atomic_udec_wrap_i32_noret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB115_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB115_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: .LBB115_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: s_cbranch_execnz .LBB115_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v[0:1], v2, off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: .LBB115_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GFX9-NEXT: v_add_u32_e32 v3, -1, v4 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB115_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -8265,14 +11065,34 @@ ; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB116_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[8:11], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB116_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8281,18 +11101,53 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: .LBB116_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: s_cbranch_execnz .LBB116_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: .LBB116_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GFX9-NEXT: v_add_u32_e32 v3, -1, v4 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB116_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -8303,32 +11158,90 @@ ; SI-LABEL: global_atomic_udec_wrap_i32_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB117_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v5 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v5, v2 +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB117_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: .LBB117_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: s_cbranch_execnz .LBB117_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: .LBB117_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GFX9-NEXT: v_add_u32_e32 v3, -1, v4 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB117_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -8338,35 +11251,91 @@ ; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB118_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v5 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v5, v2 +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[8:11], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB118_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: .LBB118_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2 +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: s_cbranch_execnz .LBB118_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GFX9-NEXT: v_add_u32_e32 v3, -1, v4 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB118_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -8386,11 +11355,31 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: .LBB119_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, -1, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s34 +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB119_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8404,22 +11393,58 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB119_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB119_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GFX9-NEXT: v_add_u32_e32 v0, -1, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v0, v1, s[4:5] +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX9-NEXT: s_cbranch_execnz .LBB119_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -8438,11 +11463,31 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: .LBB120_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, -1, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s34 +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB120_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8454,26 +11499,62 @@ ; VI-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_add_u32 s34, s4, 16 -; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_add_u32 s36, s4, 16 +; VI-NEXT: s_addc_u32 s37, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s36 +; VI-NEXT: v_mov_b32_e32 v1, s37 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[38:39], 0 +; VI-NEXT: .LBB120_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v2, s36 +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, s37 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; VI-NEXT: s_cbranch_execnz .LBB120_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[38:39] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GFX9-NEXT: v_add_u32_e32 v0, -1, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v0, v1, s[4:5] offset:16 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX9-NEXT: s_cbranch_execnz .LBB120_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -8493,11 +11574,32 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: .LBB121_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB121_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8511,22 +11613,58 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB121_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, -1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB121_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_add_u32_e32 v2, -1, v3 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v0, v0, v1, s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX9-NEXT: s_cbranch_execnz .LBB121_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -8545,11 +11683,32 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: .LBB122_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB122_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8561,26 +11720,62 @@ ; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_add_u32 s34, s4, 16 -; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_add_u32 s36, s4, 16 +; VI-NEXT: s_addc_u32 s37, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s36 +; VI-NEXT: v_mov_b32_e32 v1, s37 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[38:39], 0 +; VI-NEXT: .LBB122_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, -1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v2, s36 +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, s37 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; VI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; VI-NEXT: s_cbranch_execnz .LBB122_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[38:39] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_add_u32_e32 v2, -1, v3 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX9-NEXT: s_cbranch_execnz .LBB122_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst Index: llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -3750,42 +3750,95 @@ ; CI-LABEL: atomic_xchg_f64_offset: ; CI: ; %bb.0: ; %entry ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b64 s[4:5], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x8 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s8 +; CI-NEXT: v_mov_b32_e32 v3, s9 +; CI-NEXT: .LBB65_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v0, s6 +; CI-NEXT: v_mov_b32_e32 v1, s7 +; CI-NEXT: v_mov_b32_e32 v7, v3 +; CI-NEXT: v_mov_b32_e32 v6, v2 +; CI-NEXT: v_mov_b32_e32 v5, v1 +; CI-NEXT: v_mov_b32_e32 v4, v0 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:32 +; CI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; CI-NEXT: v_mov_b32_e32 v2, v4 +; CI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CI-NEXT: v_mov_b32_e32 v3, v5 +; CI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; CI-NEXT: s_cbranch_execnz .LBB65_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_xchg_f64_offset: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b64 s[8:9], 0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x20 +; VI-NEXT: s_add_u32 s4, s0, 32 +; VI-NEXT: s_addc_u32 s5, s1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: .LBB65_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v7, v3 +; VI-NEXT: v_mov_b32_e32 v6, v2 +; VI-NEXT: v_mov_b32_e32 v5, v1 +; VI-NEXT: v_mov_b32_e32 v4, v0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:32 +; VI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; VI-NEXT: v_mov_b32_e32 v2, v4 +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: v_mov_b32_e32 v3, v5 +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB65_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xchg_f64_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB65_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr double, ptr addrspace(1) %out, i64 4 @@ -3797,42 +3850,95 @@ ; CI-LABEL: atomic_xchg_pointer_offset: ; CI: ; %bb.0: ; %entry ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b64 s[4:5], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x8 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s8 +; CI-NEXT: v_mov_b32_e32 v3, s9 +; CI-NEXT: .LBB66_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v0, s6 +; CI-NEXT: v_mov_b32_e32 v1, s7 +; CI-NEXT: v_mov_b32_e32 v7, v3 +; CI-NEXT: v_mov_b32_e32 v6, v2 +; CI-NEXT: v_mov_b32_e32 v5, v1 +; CI-NEXT: v_mov_b32_e32 v4, v0 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:32 +; CI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; CI-NEXT: v_mov_b32_e32 v2, v4 +; CI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CI-NEXT: v_mov_b32_e32 v3, v5 +; CI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; CI-NEXT: s_cbranch_execnz .LBB66_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_xchg_pointer_offset: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b64 s[8:9], 0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x20 +; VI-NEXT: s_add_u32 s4, s0, 32 +; VI-NEXT: s_addc_u32 s5, s1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: .LBB66_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v7, v3 +; VI-NEXT: v_mov_b32_e32 v6, v2 +; VI-NEXT: v_mov_b32_e32 v5, v1 +; VI-NEXT: v_mov_b32_e32 v4, v0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:32 +; VI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; VI-NEXT: v_mov_b32_e32 v2, v4 +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: v_mov_b32_e32 v3, v5 +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB66_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_xchg_pointer_offset: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB66_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: %gep = getelementptr ptr, ptr addrspace(1) %out, i64 4 Index: llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -15,27 +15,70 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB0_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB0_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xchg_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB0_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v6 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v5, v7 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB0_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB0_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -49,10 +92,27 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB1_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB1_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -61,18 +121,43 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB1_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; VI-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v6 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v5, v7 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB1_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB1_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst @@ -87,29 +172,79 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB2_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB2_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xchg_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB2_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v6 +; VI-NEXT: v_mov_b32_e32 v5, v7 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB2_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v6 +; VI-NEXT: v_mov_b32_e32 v1, v7 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB2_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -123,32 +258,79 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB3_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB3_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xchg_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[6:7] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB3_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, v1 +; VI-NEXT: v_mov_b32_e32 v4, v0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[2:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB3_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB3_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst @@ -165,16 +347,33 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB4_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v2, s35 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB4_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -186,26 +385,52 @@ ; VI-LABEL: global_atomic_xchg_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB4_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB4_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB4_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -221,14 +446,33 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB5_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v2, s35 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB5_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -242,26 +486,52 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB5_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s34 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB5_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB5_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst @@ -278,16 +548,36 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB6_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: v_mov_b32_e32 v6, s35 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB6_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -299,26 +589,54 @@ ; VI-LABEL: global_atomic_xchg_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB6_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: global_atomic_xchg_i64_ret_scalar: -; GFX9: ; %bb.0: +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB6_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_atomic_xchg_i64_ret_scalar: +; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB6_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -334,14 +652,36 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB7_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: v_mov_b32_e32 v6, s35 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB7_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -355,26 +695,54 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB7_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB7_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst @@ -452,27 +820,70 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB8_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB8_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xchg_f64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB8_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v6 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v5, v7 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB8_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_f64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB8_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, double %in seq_cst ret void @@ -549,10 +960,27 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB9_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB9_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -561,18 +989,43 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB9_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; VI-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v6 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v5, v7 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB9_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_f64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst @@ -652,29 +1105,79 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB10_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB10_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xchg_f64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB10_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v6 +; VI-NEXT: v_mov_b32_e32 v5, v7 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB10_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v6 +; VI-NEXT: v_mov_b32_e32 v1, v7 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_f64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB10_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr addrspace(1) %ptr, double %in seq_cst ret double %result @@ -755,32 +1258,79 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB11_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB11_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xchg_f64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[6:7] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB11_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, v1 +; VI-NEXT: v_mov_b32_e32 v4, v0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[2:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB11_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_f64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[6:7], v[0:1], v[2:5], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB11_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(1) %out, i32 4 %result = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst @@ -871,16 +1421,33 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB12_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v2, s35 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB12_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -892,26 +1459,52 @@ ; VI-LABEL: global_atomic_xchg_f64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB12_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB12_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_f64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB12_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, double %in seq_cst ret void @@ -1005,19 +1598,38 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap_x2 v[1:2], off, s[4:7], 0 offset:32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 -; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: s_mov_b64 exec, s[34:35] +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB13_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v2, s35 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB13_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v0, 1 +; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1026,26 +1638,52 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB13_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s34 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB13_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_f64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB13_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst @@ -1139,16 +1777,36 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB14_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: v_mov_b32_e32 v6, s35 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB14_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1160,26 +1818,54 @@ ; VI-LABEL: global_atomic_xchg_f64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB14_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB14_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_f64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xchg ptr addrspace(1) %ptr, double %in seq_cst ret double %result @@ -1276,14 +1962,36 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB15_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: v_mov_b32_e32 v6, s35 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB15_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1297,26 +2005,54 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB15_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB15_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xchg_f64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB15_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(1) %out, i32 4 %result = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst @@ -1335,27 +2071,79 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB16_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, v6, v2 +; SI-NEXT: v_addc_u32_e32 v5, vcc, v7, v3, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB16_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_add_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB16_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v2 +; VI-NEXT: v_addc_u32_e32 v5, vcc, v7, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB16_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_add_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB16_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -1369,10 +2157,30 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB17_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, v6, v2 +; SI-NEXT: v_addc_u32_e32 v5, vcc, v7, v3, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB17_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1381,18 +2189,49 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB17_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v2 +; VI-NEXT: v_addc_u32_e32 v5, vcc, v7, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB17_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_add_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst @@ -1403,33 +2242,91 @@ ; SI-LABEL: global_atomic_add_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB18_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, v10, v5 +; SI-NEXT: v_addc_u32_e32 v9, vcc, v11, v4, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB18_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; ; VI-LABEL: global_atomic_add_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB18_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v2 +; VI-NEXT: v_addc_u32_e32 v5, vcc, v7, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB18_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_add_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB18_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw add ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -1439,36 +2336,91 @@ ; SI-LABEL: global_atomic_add_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB19_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, v10, v5 +; SI-NEXT: v_addc_u32_e32 v9, vcc, v11, v4, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB19_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_add_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB19_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_add_u32_e32 v6, vcc, v8, v2 +; VI-NEXT: v_addc_u32_e32 v7, vcc, v9, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB19_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_add_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB19_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst @@ -1485,16 +2437,35 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB20_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v2, s35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, s34, v3 +; SI-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_add_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB20_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1506,26 +2477,56 @@ ; VI-LABEL: global_atomic_add_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB20_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB20_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_add_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB20_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -1541,14 +2542,35 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB21_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v2, s35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, s34, v3 +; SI-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_add_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB21_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1562,26 +2584,56 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB21_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB21_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_add_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB21_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst @@ -1598,16 +2650,37 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB22_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, s34, v7 +; SI-NEXT: v_addc_u32_e32 v6, vcc, v8, v0, vcc +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB22_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1619,26 +2692,56 @@ ; VI-LABEL: global_atomic_add_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB22_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB22_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_add_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s6, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB22_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw add ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -1654,14 +2757,37 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB23_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, s34, v7 +; SI-NEXT: v_addc_u32_e32 v6, vcc, v8, v0, vcc +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB23_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1675,26 +2801,56 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB23_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB23_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_add_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s6, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB23_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst @@ -1713,27 +2869,79 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB24_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; SI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB24_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB24_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; VI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB24_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB24_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -1747,10 +2955,30 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB25_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; SI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB25_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1759,18 +2987,49 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB25_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; VI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB25_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB25_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst @@ -1781,33 +3040,91 @@ ; SI-LABEL: global_atomic_sub_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB26_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_sub_i32_e32 v8, vcc, v10, v5 +; SI-NEXT: v_subb_u32_e32 v9, vcc, v11, v4, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB26_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB26_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; VI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB26_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB26_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -1817,36 +3134,91 @@ ; SI-LABEL: global_atomic_sub_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB27_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_sub_i32_e32 v8, vcc, v10, v5 +; SI-NEXT: v_subb_u32_e32 v9, vcc, v11, v4, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB27_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB27_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_sub_u32_e32 v6, vcc, v8, v2 +; VI-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB27_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB27_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst @@ -1863,17 +3235,36 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB28_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v2, s35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_subrev_i32_e32 v1, vcc, s34, v3 +; SI-NEXT: v_subb_u32_e32 v2, vcc, v4, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB28_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload @@ -1884,26 +3275,56 @@ ; VI-LABEL: global_atomic_sub_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB28_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB28_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB28_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -1919,14 +3340,35 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB29_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v2, s35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_subrev_i32_e32 v1, vcc, s34, v3 +; SI-NEXT: v_subb_u32_e32 v2, vcc, v4, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB29_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1940,26 +3382,56 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB29_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB29_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB29_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst @@ -1976,16 +3448,37 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB30_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_subrev_i32_e32 v5, vcc, s34, v7 +; SI-NEXT: v_subb_u32_e32 v6, vcc, v8, v0, vcc +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB30_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1997,26 +3490,56 @@ ; VI-LABEL: global_atomic_sub_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB30_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB30_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s6, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v6, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB30_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -2032,14 +3555,37 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB31_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_subrev_i32_e32 v5, vcc, s34, v7 +; SI-NEXT: v_subb_u32_e32 v6, vcc, v8, v0, vcc +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB31_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2053,26 +3599,56 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB31_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB31_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s6, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v6, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB31_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst @@ -2091,27 +3667,79 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB32_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, v7, v3 +; SI-NEXT: v_and_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB32_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB32_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v5, v7, v3 +; VI-NEXT: v_and_b32_e32 v4, v6, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB32_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB32_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -2125,30 +3753,81 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB33_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, v7, v3 +; SI-NEXT: v_and_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB33_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; ; VI-LABEL: global_atomic_and_i64_noret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB33_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v5, v7, v3 +; VI-NEXT: v_and_b32_e32 v4, v6, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB33_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB33_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst @@ -2159,33 +3838,91 @@ ; SI-LABEL: global_atomic_and_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB34_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_and_b32_e32 v9, v11, v4 +; SI-NEXT: v_and_b32_e32 v8, v10, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB34_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB34_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_and_b32_e32 v5, v7, v3 +; VI-NEXT: v_and_b32_e32 v4, v6, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB34_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB34_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -2195,36 +3932,91 @@ ; SI-LABEL: global_atomic_and_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB35_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_and_b32_e32 v9, v11, v4 +; SI-NEXT: v_and_b32_e32 v8, v10, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB35_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB35_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_and_b32_e32 v7, v9, v3 +; VI-NEXT: v_and_b32_e32 v6, v8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB35_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB35_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst @@ -2241,16 +4033,34 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB36_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, s35, v4 +; SI-NEXT: v_and_b32_e32 v1, s34, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB36_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2262,26 +4072,54 @@ ; VI-LABEL: global_atomic_and_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB36_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v1, s7, v3 +; VI-NEXT: v_and_b32_e32 v0, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB36_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v0, s6, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB36_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -2297,14 +4135,34 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB37_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, s35, v4 +; SI-NEXT: v_and_b32_e32 v1, s34, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB37_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2318,26 +4176,54 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB37_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v1, s7, v3 +; VI-NEXT: v_and_b32_e32 v0, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB37_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v0, s6, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB37_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst @@ -2354,18 +4240,38 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 -; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB38_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, s35, v8 +; SI-NEXT: v_and_b32_e32 v5, s34, v7 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB38_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] @@ -2375,26 +4281,54 @@ ; VI-LABEL: global_atomic_and_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB38_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_and_b32_e32 v1, s7, v3 +; VI-NEXT: v_and_b32_e32 v0, s6, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB38_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_and_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_and_b32_e32 v3, s6, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB38_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -2410,14 +4344,36 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB39_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, s35, v8 +; SI-NEXT: v_and_b32_e32 v5, s34, v7 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB39_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2431,26 +4387,54 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB39_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_and_b32_e32 v1, s7, v3 +; VI-NEXT: v_and_b32_e32 v0, s6, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB39_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_and_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_and_b32_e32 v3, s6, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB39_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst @@ -3303,27 +5287,79 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB48_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v7, v3 +; SI-NEXT: v_or_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB48_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_or_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB48_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v5, v7, v3 +; VI-NEXT: v_or_b32_e32 v4, v6, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB48_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB48_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -3337,10 +5373,30 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB49_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v7, v3 +; SI-NEXT: v_or_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB49_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3349,18 +5405,49 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB49_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v5, v7, v3 +; VI-NEXT: v_or_b32_e32 v4, v6, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB49_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB49_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst @@ -3371,33 +5458,91 @@ ; SI-LABEL: global_atomic_or_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB50_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_or_b32_e32 v9, v11, v4 +; SI-NEXT: v_or_b32_e32 v8, v10, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB50_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_or_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB50_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_or_b32_e32 v5, v7, v3 +; VI-NEXT: v_or_b32_e32 v4, v6, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB50_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB50_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -3407,36 +5552,91 @@ ; SI-LABEL: global_atomic_or_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB51_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_or_b32_e32 v9, v11, v4 +; SI-NEXT: v_or_b32_e32 v8, v10, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB51_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_or_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB51_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_or_b32_e32 v7, v9, v3 +; VI-NEXT: v_or_b32_e32 v6, v8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB51_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB51_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst @@ -3453,16 +5653,34 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB52_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, s35, v4 +; SI-NEXT: v_or_b32_e32 v1, s34, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB52_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3474,26 +5692,54 @@ ; VI-LABEL: global_atomic_or_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB52_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v1, s7, v3 +; VI-NEXT: v_or_b32_e32 v0, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB52_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_or_b32_e32 v0, s6, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB52_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -3509,14 +5755,34 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB53_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, s35, v4 +; SI-NEXT: v_or_b32_e32 v1, s34, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB53_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3530,26 +5796,54 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB53_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v1, s7, v3 +; VI-NEXT: v_or_b32_e32 v0, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB53_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_or_b32_e32 v0, s6, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB53_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst @@ -3566,16 +5860,36 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB54_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v6, s35, v8 +; SI-NEXT: v_or_b32_e32 v5, s34, v7 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB54_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3587,26 +5901,54 @@ ; VI-LABEL: global_atomic_or_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB54_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_or_b32_e32 v1, s7, v3 +; VI-NEXT: v_or_b32_e32 v0, s6, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB54_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_or_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_or_b32_e32 v3, s6, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB54_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -3622,14 +5964,36 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB55_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v6, s35, v8 +; SI-NEXT: v_or_b32_e32 v5, s34, v7 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB55_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3643,26 +6007,54 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB55_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_or_b32_e32 v1, s7, v3 +; VI-NEXT: v_or_b32_e32 v0, s6, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB55_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_or_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_or_b32_e32 v3, s6, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB55_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst @@ -3681,27 +6073,79 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB56_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v5, v7, v3 +; SI-NEXT: v_xor_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB56_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB56_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v5, v7, v3 +; VI-NEXT: v_xor_b32_e32 v4, v6, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB56_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB56_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -3715,10 +6159,30 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB57_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v5, v7, v3 +; SI-NEXT: v_xor_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB57_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3727,18 +6191,49 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB57_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v5, v7, v3 +; VI-NEXT: v_xor_b32_e32 v4, v6, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB57_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB57_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst @@ -3749,33 +6244,91 @@ ; SI-LABEL: global_atomic_xor_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB58_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_xor_b32_e32 v9, v11, v4 +; SI-NEXT: v_xor_b32_e32 v8, v10, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB58_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB58_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_xor_b32_e32 v5, v7, v3 +; VI-NEXT: v_xor_b32_e32 v4, v6, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB58_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB58_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -3785,36 +6338,91 @@ ; SI-LABEL: global_atomic_xor_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB59_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_xor_b32_e32 v9, v11, v4 +; SI-NEXT: v_xor_b32_e32 v8, v10, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB59_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB59_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_xor_b32_e32 v7, v9, v3 +; VI-NEXT: v_xor_b32_e32 v6, v8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB59_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB59_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst @@ -3831,16 +6439,34 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB60_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v2, s35, v4 +; SI-NEXT: v_xor_b32_e32 v1, s34, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB60_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3852,26 +6478,54 @@ ; VI-LABEL: global_atomic_xor_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB60_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v1, s7, v3 +; VI-NEXT: v_xor_b32_e32 v0, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB60_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB60_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -3887,14 +6541,34 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB61_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v2, s35, v4 +; SI-NEXT: v_xor_b32_e32 v1, s34, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB61_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3908,26 +6582,54 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB61_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v1, s7, v3 +; VI-NEXT: v_xor_b32_e32 v0, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB61_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB61_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst @@ -3944,16 +6646,36 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB62_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_xor_b32_e32 v6, s35, v8 +; SI-NEXT: v_xor_b32_e32 v5, s34, v7 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB62_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3965,26 +6687,54 @@ ; VI-LABEL: global_atomic_xor_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB62_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_xor_b32_e32 v1, s7, v3 +; VI-NEXT: v_xor_b32_e32 v0, s6, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB62_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_xor_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_xor_b32_e32 v3, s6, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB62_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -4000,14 +6750,36 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB63_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_xor_b32_e32 v6, s35, v8 +; SI-NEXT: v_xor_b32_e32 v5, s34, v7 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB63_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -4021,26 +6793,54 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB63_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_xor_b32_e32 v1, s7, v3 +; VI-NEXT: v_xor_b32_e32 v0, s6, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB63_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_xor_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_xor_b32_e32 v3, s6, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB63_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst @@ -8733,27 +11533,88 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB107_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB107_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB107_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB107_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB107_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -8767,10 +11628,33 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB108_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB108_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8779,18 +11663,55 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB108_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB108_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB108_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -8801,33 +11722,100 @@ ; SI-LABEL: global_atomic_uinc_wrap_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB109_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v10 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v11, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB109_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB109_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB109_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB109_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB109_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -8837,36 +11825,100 @@ ; SI-LABEL: global_atomic_uinc_wrap_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB110_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v10 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v11, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB110_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB110_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v8 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB110_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB110_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB110_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -8883,16 +11935,37 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB111_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v3 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[3:4] +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB111_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8904,26 +11977,60 @@ ; VI-LABEL: global_atomic_uinc_wrap_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB111_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB111_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB111_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -8939,14 +12046,37 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB112_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v3 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[3:4] +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB112_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8960,26 +12090,60 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB112_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB112_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB112_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -8996,16 +12160,39 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB113_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v7 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[7:8] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB113_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -9017,26 +12204,60 @@ ; VI-LABEL: global_atomic_uinc_wrap_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB113_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB113_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB113_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -9052,14 +12273,39 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB114_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v7 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[7:8] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB114_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -9073,26 +12319,60 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB114_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB114_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB114_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -9107,31 +12387,98 @@ ; SI-LABEL: global_atomic_udec_wrap_i64_noret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB115_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, -1, v6 +; SI-NEXT: v_addc_u32_e32 v5, vcc, -1, v7, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB115_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[8:9], 0 +; VI-NEXT: .LBB115_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; VI-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB115_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB115_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB115_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -9141,14 +12488,39 @@ ; SI-LABEL: global_atomic_udec_wrap_i64_noret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[8:11], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB116_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, -1, v6 +; SI-NEXT: v_addc_u32_e32 v5, vcc, -1, v7, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB116_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -9157,18 +12529,59 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[8:9], 0 +; VI-NEXT: .LBB116_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; VI-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB116_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB116_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB116_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -9178,34 +12591,107 @@ define i64 @global_atomic_udec_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-LABEL: global_atomic_udec_wrap_i64_ret: ; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB117_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v10 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v11, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[4:5] +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, v1, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB117_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[8:9], 0 +; VI-NEXT: .LBB117_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; VI-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB117_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[8:9] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB117_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB117_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -9215,36 +12701,106 @@ ; SI-LABEL: global_atomic_udec_wrap_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[8:11], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB118_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v10 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v11, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[4:5] +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, v1, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[8:11], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB118_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[8:9], 0 +; VI-NEXT: .LBB118_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; VI-NEXT: v_add_u32_e64 v0, s[6:7], -1, v8 +; VI-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB118_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB118_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -9261,16 +12817,41 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: .LBB119_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, -1, v3 +; SI-NEXT: v_addc_u32_e32 v2, vcc, -1, v4, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[3:4] +; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[3:4] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, s35 +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB119_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -9282,26 +12863,68 @@ ; VI-LABEL: global_atomic_udec_wrap_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB119_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB119_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX9-NEXT: s_cbranch_execnz .LBB119_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -9317,14 +12940,41 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: .LBB120_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, -1, v3 +; SI-NEXT: v_addc_u32_e32 v2, vcc, -1, v4, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[3:4] +; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[3:4] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, s35 +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB120_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -9336,28 +12986,70 @@ ; VI-LABEL: global_atomic_udec_wrap_i64_noret_offset_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_add_u32 s34, s4, 32 -; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: s_add_u32 s36, s4, 32 +; VI-NEXT: s_addc_u32 s37, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s36 +; VI-NEXT: v_mov_b32_e32 v1, s37 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[38:39], 0 +; VI-NEXT: .LBB120_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v4, s36 +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v5, s37 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; VI-NEXT: s_cbranch_execnz .LBB120_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[38:39] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX9-NEXT: s_cbranch_execnz .LBB120_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -9374,16 +13066,43 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: .LBB121_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v7 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v8, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[7:8] +; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[7:8] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s35 +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v6, v1, v3, vcc ; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB121_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -9395,26 +13114,68 @@ ; VI-LABEL: global_atomic_udec_wrap_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[38:39], 0 +; VI-NEXT: .LBB121_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v7, s[36:37], -1, v2 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; VI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; VI-NEXT: s_cbranch_execnz .LBB121_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[38:39] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[5:6] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[5:6] +; GFX9-NEXT: v_add_co_u32_e64 v3, s[36:37], -1, v5 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[36:37], -1, v6, s[36:37] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB121_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -9430,14 +13191,43 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: .LBB122_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v7 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v8, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[7:8] +; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[7:8] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s35 +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v6, v1, v3, vcc +; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB122_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -9449,28 +13239,70 @@ ; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_add_u32 s34, s4, 32 -; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: s_add_u32 s38, s4, 32 +; VI-NEXT: s_addc_u32 s39, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s38 +; VI-NEXT: v_mov_b32_e32 v1, s39 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[40:41], 0 +; VI-NEXT: .LBB122_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v7, s[36:37], -1, v2 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s38 +; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v5, s39 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; VI-NEXT: s_andn2_b64 exec, exec, s[40:41] +; VI-NEXT: s_cbranch_execnz .LBB122_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[40:41] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[5:6] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[5:6] +; GFX9-NEXT: v_add_co_u32_e64 v3, s[36:37], -1, v5 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[36:37], -1, v6, s[36:37] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB122_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -1082,12 +1082,26 @@ ; GFX7-LABEL: flat_system_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_monotonic_atomicrmw: @@ -1095,11 +1109,25 @@ ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: flat_load_dword v1, v[0:1] +; GFX10-WGP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_system_monotonic_atomicrmw: @@ -1107,62 +1135,142 @@ ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: flat_load_dword v1, v[0:1] +; GFX10-CU-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_load_dword v1, v[0:1] +; SKIP-CACHE-INV-NEXT: .LBB8_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB8_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-TGSPLIT-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX940-TGSPLIT-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_monotonic_atomicrmw: @@ -1170,10 +1278,24 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: flat_load_b32 v1, v[0:1] +; GFX11-WGP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_monotonic_atomicrmw: @@ -1181,10 +1303,24 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: flat_load_b32 v1, v[0:1] +; GFX11-CU-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -1196,14 +1332,27 @@ ; GFX7-LABEL: flat_system_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_acquire_atomicrmw: @@ -1211,15 +1360,27 @@ ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_load_dword v1, v[0:1] +; GFX10-WGP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_system_acquire_atomicrmw: @@ -1227,77 +1388,150 @@ ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_load_dword v1, v[0:1] +; GFX10-CU-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_load_dword v1, v[0:1] +; SKIP-CACHE-INV-NEXT: .LBB9_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB9_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-TGSPLIT-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_system_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX940-TGSPLIT-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_acquire_atomicrmw: @@ -1305,14 +1539,26 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_load_b32 v1, v[0:1] +; GFX11-WGP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_acquire_atomicrmw: @@ -1320,14 +1566,26 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_load_b32 v1, v[0:1] +; GFX11-CU-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -1339,13 +1597,26 @@ ; GFX7-LABEL: flat_system_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_release_atomicrmw: @@ -1353,13 +1624,26 @@ ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_load_dword v1, v[0:1] +; GFX10-WGP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_system_release_atomicrmw: @@ -1367,73 +1651,147 @@ ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_load_dword v1, v[0:1] +; GFX10-CU-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_load_dword v1, v[0:1] +; SKIP-CACHE-INV-NEXT: .LBB10_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB10_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-TGSPLIT-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_system_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX940-TGSPLIT-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_release_atomicrmw: @@ -1441,12 +1799,25 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: flat_load_b32 v1, v[0:1] +; GFX11-WGP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_release_atomicrmw: @@ -1454,12 +1825,25 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: flat_load_b32 v1, v[0:1] +; GFX11-CU-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -1471,15 +1855,27 @@ ; GFX7-LABEL: flat_system_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_acq_rel_atomicrmw: @@ -1487,17 +1883,28 @@ ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_load_dword v1, v[0:1] +; GFX10-WGP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_system_acq_rel_atomicrmw: @@ -1505,88 +1912,155 @@ ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_load_dword v1, v[0:1] +; GFX10-CU-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_load_dword v1, v[0:1] +; SKIP-CACHE-INV-NEXT: .LBB11_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB11_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-TGSPLIT-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX940-TGSPLIT-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_acq_rel_atomicrmw: @@ -1594,16 +2068,27 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: flat_load_b32 v1, v[0:1] +; GFX11-WGP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_acq_rel_atomicrmw: @@ -1611,16 +2096,27 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: flat_load_b32 v1, v[0:1] +; GFX11-CU-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -1632,15 +2128,27 @@ ; GFX7-LABEL: flat_system_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_seq_cst_atomicrmw: @@ -1648,17 +2156,28 @@ ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_load_dword v1, v[0:1] +; GFX10-WGP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_system_seq_cst_atomicrmw: @@ -1666,88 +2185,155 @@ ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_load_dword v1, v[0:1] +; GFX10-CU-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_load_dword v1, v[0:1] +; SKIP-CACHE-INV-NEXT: .LBB12_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB12_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-TGSPLIT-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX940-TGSPLIT-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_seq_cst_atomicrmw: @@ -1755,16 +2341,27 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: flat_load_b32 v1, v[0:1] +; GFX11-WGP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 -; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_seq_cst_atomicrmw: @@ -1772,16 +2369,27 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: flat_load_b32 v1, v[0:1] +; GFX11-CU-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -1793,15 +2401,31 @@ ; GFX7-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, s0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: flat_store_dword v[1:2], v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_acquire_ret_atomicrmw: @@ -1809,15 +2433,31 @@ ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: flat_load_dword v0, v[0:1] +; GFX10-WGP-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: flat_store_dword v[1:2], v0 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_system_acquire_ret_atomicrmw: @@ -1825,82 +2465,170 @@ ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: flat_load_dword v0, v[0:1] +; GFX10-CU-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-CU-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: flat_store_dword v[1:2], v0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: .LBB13_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v4, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB13_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end +; SKIP-CACHE-INV-NEXT: s_or_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[1:2], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-TGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX940-TGSPLIT-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-TGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_acquire_ret_atomicrmw: @@ -1908,14 +2636,28 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: flat_load_b32 v0, v[0:1] +; GFX11-WGP-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, v0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[1:2] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 +; GFX11-WGP-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_acquire_ret_atomicrmw: @@ -1923,14 +2665,28 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: flat_load_b32 v0, v[0:1] +; GFX11-CU-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, v0 +; GFX11-CU-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[1:2] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-CU-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 +; GFX11-CU-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -1943,16 +2699,32 @@ ; GFX7-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: flat_store_dword v[1:2], v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw: @@ -1960,17 +2732,33 @@ ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_load_dword v0, v[0:1] +; GFX10-WGP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: flat_store_dword v[1:2], v0 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_system_acq_rel_ret_atomicrmw: @@ -1978,93 +2766,181 @@ ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_load_dword v0, v[0:1] +; GFX10-CU-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-CU-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: flat_store_dword v[1:2], v0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: .LBB14_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v4, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB14_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end +; SKIP-CACHE-INV-NEXT: s_or_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[1:2], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-TGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX940-TGSPLIT-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-TGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw: @@ -2072,16 +2948,30 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: flat_load_b32 v0, v[0:1] +; GFX11-WGP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, v0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[1:2] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 +; GFX11-WGP-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_acq_rel_ret_atomicrmw: @@ -2089,16 +2979,30 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: flat_load_b32 v0, v[0:1] +; GFX11-CU-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, v0 +; GFX11-CU-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[1:2] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-CU-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 +; GFX11-CU-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -2111,16 +3015,32 @@ ; GFX7-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: flat_store_dword v[1:2], v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw: @@ -2128,17 +3048,33 @@ ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_load_dword v0, v[0:1] +; GFX10-WGP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: flat_store_dword v[1:2], v0 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_system_seq_cst_ret_atomicrmw: @@ -2146,93 +3082,181 @@ ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_load_dword v0, v[0:1] +; GFX10-CU-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-CU-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: flat_store_dword v[1:2], v0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: .LBB15_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v4, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB15_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end +; SKIP-CACHE-INV-NEXT: s_or_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[1:2], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-TGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX940-TGSPLIT-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-TGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw: @@ -2240,16 +3264,30 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: flat_load_b32 v0, v[0:1] +; GFX11-WGP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, v0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[1:2] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 +; GFX11-WGP-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_seq_cst_ret_atomicrmw: @@ -2257,16 +3295,30 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: flat_load_b32 v0, v[0:1] +; GFX11-CU-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, v0 +; GFX11-CU-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[1:2] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-CU-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 +; GFX11-CU-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -8171,12 +9223,26 @@ ; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw: @@ -8184,11 +9250,25 @@ ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: flat_load_dword v1, v[0:1] +; GFX10-WGP-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: @@ -8196,62 +9276,142 @@ ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: flat_load_dword v1, v[0:1] +; GFX10-CU-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_load_dword v1, v[0:1] +; SKIP-CACHE-INV-NEXT: .LBB54_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB54_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-TGSPLIT-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX940-TGSPLIT-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw: @@ -8259,10 +9419,24 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: flat_load_b32 v1, v[0:1] +; GFX11-WGP-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: @@ -8270,10 +9444,24 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: flat_load_b32 v1, v[0:1] +; GFX11-CU-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -8285,14 +9473,28 @@ ; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw: @@ -8300,14 +9502,28 @@ ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_load_dword v1, v[0:1] +; GFX10-WGP-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw: @@ -8315,76 +9531,153 @@ ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_load_dword v1, v[0:1] +; GFX10-CU-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_load_dword v1, v[0:1] +; SKIP-CACHE-INV-NEXT: .LBB55_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB55_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-TGSPLIT-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX940-TGSPLIT-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_one_as_acquire_atomicrmw: @@ -8392,13 +9685,27 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 -; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_load_b32 v1, v[0:1] +; GFX11-WGP-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_atomicrmw: @@ -8406,13 +9713,27 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 -; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_load_b32 v1, v[0:1] +; GFX11-CU-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -8424,13 +9745,26 @@ ; GFX7-LABEL: flat_system_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw: @@ -8438,13 +9772,26 @@ ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: flat_load_dword v1, v[0:1] +; GFX10-WGP-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw: @@ -8452,73 +9799,147 @@ ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: flat_load_dword v1, v[0:1] +; GFX10-CU-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_load_dword v1, v[0:1] +; SKIP-CACHE-INV-NEXT: .LBB56_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB56_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-TGSPLIT-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB56_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX940-TGSPLIT-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB56_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_one_as_release_atomicrmw: @@ -8526,12 +9947,25 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: flat_load_b32 v1, v[0:1] +; GFX11-WGP-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_release_atomicrmw: @@ -8539,12 +9973,25 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: flat_load_b32 v1, v[0:1] +; GFX11-CU-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -8556,15 +10003,28 @@ ; GFX7-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw: @@ -8572,16 +10032,29 @@ ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: flat_load_dword v1, v[0:1] +; GFX10-WGP-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: @@ -8589,87 +10062,158 @@ ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: flat_load_dword v1, v[0:1] +; GFX10-CU-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB57_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_load_dword v1, v[0:1] +; SKIP-CACHE-INV-NEXT: .LBB57_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB57_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB57_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-TGSPLIT-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB57_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB57_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX940-TGSPLIT-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB57_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw: @@ -8677,15 +10221,28 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: flat_load_b32 v1, v[0:1] +; GFX11-WGP-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: @@ -8693,15 +10250,28 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) -; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: flat_load_b32 v1, v[0:1] +; GFX11-CU-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB57_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -8713,15 +10283,28 @@ ; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB58_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw: @@ -8729,16 +10312,29 @@ ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: flat_load_dword v1, v[0:1] +; GFX10-WGP-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB58_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: @@ -8746,87 +10342,158 @@ ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: flat_load_dword v1, v[0:1] +; GFX10-CU-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB58_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_load_dword v1, v[0:1] +; SKIP-CACHE-INV-NEXT: .LBB58_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB58_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB58_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-TGSPLIT-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB58_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB58_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: flat_load_dword v1, v[0:1] +; GFX940-TGSPLIT-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1 +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB58_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw: @@ -8834,15 +10501,28 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: flat_load_b32 v1, v[0:1] +; GFX11-WGP-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: @@ -8850,15 +10530,28 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) -; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: flat_load_b32 v1, v[0:1] +; GFX11-CU-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB58_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -8870,16 +10563,32 @@ ; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, s0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB59_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: flat_store_dword v[1:2], v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw: @@ -8887,16 +10596,32 @@ ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: flat_load_dword v0, v[0:1] +; GFX10-WGP-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB59_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: flat_store_dword v[1:2], v0 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw: @@ -8904,85 +10629,173 @@ ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: flat_load_dword v0, v[0:1] +; GFX10-CU-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB59_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-CU-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: flat_store_dword v[1:2], v0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: .LBB59_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v4, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB59_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end +; SKIP-CACHE-INV-NEXT: s_or_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[1:2], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB59_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB59_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-TGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB59_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX940-TGSPLIT-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 -; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB59_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-TGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw: @@ -8990,15 +10803,29 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: flat_load_b32 v0, v[0:1] +; GFX11-WGP-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, v0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[1:2] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 +; GFX11-WGP-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw: @@ -9006,15 +10833,29 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: flat_load_b32 v0, v[0:1] +; GFX11-CU-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, v0 +; GFX11-CU-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[1:2] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB59_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-CU-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 +; GFX11-CU-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -9027,17 +10868,33 @@ ; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB60_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: flat_store_dword v[1:2], v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: @@ -9045,18 +10902,34 @@ ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_load_dword v0, v[0:1] +; GFX10-WGP-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB60_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: flat_store_dword v[1:2], v0 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: @@ -9064,96 +10937,184 @@ ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_load_dword v0, v[0:1] +; GFX10-CU-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB60_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-CU-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: flat_store_dword v[1:2], v0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: .LBB60_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v4, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB60_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end +; SKIP-CACHE-INV-NEXT: s_or_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[1:2], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB60_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB60_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-TGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB60_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX940-TGSPLIT-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB60_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-TGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: @@ -9161,17 +11122,31 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: flat_load_b32 v0, v[0:1] +; GFX11-WGP-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, v0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[1:2] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB60_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 +; GFX11-WGP-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: @@ -9179,17 +11154,31 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: flat_load_b32 v0, v[0:1] +; GFX11-CU-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, v0 +; GFX11-CU-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[1:2] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB60_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-CU-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 +; GFX11-CU-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -9202,17 +11191,33 @@ ; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB61_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: flat_store_dword v[1:2], v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: @@ -9220,18 +11225,34 @@ ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_load_dword v0, v[0:1] +; GFX10-WGP-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB61_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: flat_store_dword v[1:2], v0 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: @@ -9239,96 +11260,184 @@ ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_load_dword v0, v[0:1] +; GFX10-CU-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v4, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB61_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-CU-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: flat_store_dword v[1:2], v0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: .LBB61_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v4, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB61_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end +; SKIP-CACHE-INV-NEXT: s_or_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[1:2], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB61_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB61_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-TGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX940-NOTTGSPLIT-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB61_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX940-TGSPLIT-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB61_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-TGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX940-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: @@ -9336,17 +11445,31 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: flat_load_b32 v0, v[0:1] +; GFX11-WGP-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, v0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[1:2] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 +; GFX11-WGP-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: @@ -9354,17 +11477,31 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: flat_load_b32 v0, v[0:1] +; GFX11-CU-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, v0 +; GFX11-CU-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[1:2] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v2 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB61_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-CU-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 +; GFX11-CU-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -1127,121 +1127,269 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX6-LABEL: global_system_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 +; GFX10-WGP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 +; GFX10-CU-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: .LBB8_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB8_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NOTTGSPLIT-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-TGSPLIT-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX11-WGP-NEXT: s_nop 0 -; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 +; GFX11-WGP-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX11-CU-NEXT: s_nop 0 -; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 +; GFX11-CU-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -1252,144 +1400,285 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX6-LABEL: global_system_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB9_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 +; GFX10-WGP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 +; GFX10-CU-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: .LBB9_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB9_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NOTTGSPLIT-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-TGSPLIT-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 +; GFX11-WGP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 +; GFX11-CU-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -1400,140 +1689,288 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX6-LABEL: global_system_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB10_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 +; GFX10-WGP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 +; GFX10-CU-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: .LBB10_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB10_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NOTTGSPLIT-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-TGSPLIT-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 +; GFX11-WGP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX11-WGP-NEXT: s_nop 0 -; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 +; GFX11-CU-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX11-CU-NEXT: s_nop 0 -; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -1544,163 +1981,304 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX6-LABEL: global_system_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB11_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 +; GFX10-WGP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 +; GFX10-CU-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: .LBB11_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB11_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NOTTGSPLIT-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-TGSPLIT-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 +; GFX11-WGP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 +; GFX11-CU-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -1711,163 +2289,304 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX6-LABEL: global_system_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB12_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 +; GFX10-WGP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 +; GFX10-CU-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_atomicrmw: -; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: .LBB12_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB12_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NOTTGSPLIT-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-TGSPLIT-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 +; GFX11-WGP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 +; GFX11-CU-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -1878,141 +2597,291 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX6-LABEL: global_system_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB13_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: buffer_store_dword v2, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, s0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: flat_store_dword v[1:2], v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-WGP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 +; GFX10-WGP-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-CU-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 +; GFX10-CU-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-CU-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: .LBB13_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB13_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end +; SKIP-CACHE-INV-NEXT: s_or_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-TGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX940-NOTTGSPLIT-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX940-TGSPLIT-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-TGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 +; GFX11-WGP-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v1, v[2:3], s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2020,15 +2889,31 @@ ; GFX11-CU-LABEL: global_system_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 +; GFX11-CU-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v1, v[2:3], s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-CU-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-CU-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2042,158 +2927,308 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB14_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: buffer_store_dword v2, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: flat_store_dword v[1:2], v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 +; GFX10-WGP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 +; GFX10-CU-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-CU-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: .LBB14_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB14_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end +; SKIP-CACHE-INV-NEXT: s_or_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-TGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX940-NOTTGSPLIT-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX940-TGSPLIT-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-TGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 +; GFX11-WGP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v1, v[2:3], s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2201,17 +3236,33 @@ ; GFX11-CU-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 +; GFX11-CU-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v1, v[2:3], s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-CU-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-CU-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2225,158 +3276,308 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB15_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: buffer_store_dword v2, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: flat_store_dword v[1:2], v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 +; GFX10-WGP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 +; GFX10-CU-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-CU-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: .LBB15_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB15_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end +; SKIP-CACHE-INV-NEXT: s_or_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-TGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX940-NOTTGSPLIT-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX940-TGSPLIT-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-TGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 +; GFX11-WGP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v1, v[2:3], s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2384,17 +3585,33 @@ ; GFX11-CU-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 +; GFX11-CU-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v1, v[2:3], s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-CU-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-CU-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -7723,121 +8940,269 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX6-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB49_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 +; GFX10-WGP-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 +; GFX10-CU-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB49_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: .LBB49_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB49_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NOTTGSPLIT-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB49_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-TGSPLIT-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB49_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX11-WGP-NEXT: s_nop 0 -; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 +; GFX11-WGP-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX11-CU-NEXT: s_nop 0 -; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 +; GFX11-CU-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB49_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -7848,144 +9213,285 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX6-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB50_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 +; GFX10-WGP-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB50_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 +; GFX10-CU-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB50_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: .LBB50_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB50_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB50_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB50_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NOTTGSPLIT-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB50_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-TGSPLIT-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB50_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 +; GFX11-WGP-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 +; GFX11-CU-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB50_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -7996,140 +9502,288 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX6-LABEL: global_system_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB51_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 +; GFX10-WGP-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 +; GFX10-CU-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB51_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: .LBB51_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB51_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB51_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NOTTGSPLIT-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB51_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-TGSPLIT-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB51_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_release_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 +; GFX11-WGP-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX11-WGP-NEXT: s_nop 0 -; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_release_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 +; GFX11-CU-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX11-CU-NEXT: s_nop 0 -; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB51_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -8140,163 +9794,304 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB52_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 +; GFX10-WGP-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB52_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 +; GFX10-CU-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB52_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: .LBB52_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB52_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB52_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NOTTGSPLIT-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB52_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-TGSPLIT-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB52_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 +; GFX11-WGP-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 +; GFX11-CU-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB52_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -8307,163 +10102,304 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB53_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 +; GFX10-WGP-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 +; GFX10-CU-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB53_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: .LBB53_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB53_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NOTTGSPLIT-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-TGSPLIT-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-NOTTGSPLIT-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB53_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5 +; GFX940-TGSPLIT-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB53_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 +; GFX11-WGP-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 +; GFX11-CU-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] -; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB53_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -8474,141 +10410,291 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB54_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: buffer_store_dword v2, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, s0 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: flat_store_dword v[1:2], v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-WGP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 +; GFX10-WGP-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-CU-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 +; GFX10-CU-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB54_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-CU-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: .LBB54_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB54_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end +; SKIP-CACHE-INV-NEXT: s_or_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB54_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-TGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX940-NOTTGSPLIT-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 -; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX940-TGSPLIT-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB54_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-TGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 +; GFX11-WGP-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v0 +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v1, v[2:3], s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8616,15 +10702,31 @@ ; GFX11-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 +; GFX11-CU-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v0 +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v1, v[2:3], s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB54_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-CU-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-CU-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8638,158 +10740,308 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB55_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: buffer_store_dword v2, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: flat_store_dword v[1:2], v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 +; GFX10-WGP-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 +; GFX10-CU-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB55_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-CU-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: .LBB55_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB55_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end +; SKIP-CACHE-INV-NEXT: s_or_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-TGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX940-NOTTGSPLIT-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX940-TGSPLIT-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB55_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-TGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 +; GFX11-WGP-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v1, v[2:3], s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8797,17 +11049,33 @@ ; GFX11-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 +; GFX11-CU-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v1, v[2:3], s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB55_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-CU-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-CU-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8821,158 +11089,308 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB56_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: buffer_store_dword v2, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b64 s[2:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[1:2] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; GFX7-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: flat_store_dword v[1:2], v0 ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-WGP-NEXT: s_mov_b32 s3, 0 +; GFX10-WGP-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX10-WGP-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-WGP-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-CU-NEXT: s_mov_b32 s3, 0 +; GFX10-CU-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX10-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX10-CU-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX10-CU-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: s_cbranch_execnz .LBB56_1 +; GFX10-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-CU-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[4:5], 0x0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: .LBB56_1: ; %atomicrmw.start +; SKIP-CACHE-INV-NEXT: ; =>This Inner Loop Header: Depth=1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SKIP-CACHE-INV-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2 +; SKIP-CACHE-INV-NEXT: s_andn2_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_cbranch_execnz .LBB56_1 +; SKIP-CACHE-INV-NEXT: ; %bb.2: ; %atomicrmw.end +; SKIP-CACHE-INV-NEXT: s_or_b64 exec, exec, s[0:1] +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v2, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NOTTGSPLIT-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-TGSPLIT-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-TGSPLIT-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-TGSPLIT-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NOTTGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX940-NOTTGSPLIT-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX940-NOTTGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: s_cbranch_execnz .LBB56_1 +; GFX940-NOTTGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NOTTGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-TGSPLIT-NEXT: s_load_dword s5, s[2:3], 0x0 +; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 +; GFX940-TGSPLIT-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX940-TGSPLIT-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0 ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 +; GFX940-TGSPLIT-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX940-TGSPLIT-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-TGSPLIT-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: s_cbranch_execnz .LBB56_1 +; GFX940-TGSPLIT-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-TGSPLIT-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-WGP-NEXT: s_mov_b32 s1, 0 +; GFX11-WGP-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-WGP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v1, v[2:3], s[2:3] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX11-WGP-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-WGP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-WGP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-WGP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-WGP-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8980,17 +11398,33 @@ ; GFX11-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-CU-NEXT: s_mov_b32 s1, 0 +; GFX11-CU-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX11-CU-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc +; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v1, v[2:3], s[2:3] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 +; GFX11-CU-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-CU-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-CU-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: s_cbranch_execnz .LBB56_1 +; GFX11-CU-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-CU-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-CU-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm Index: llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll =================================================================== --- llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll +++ llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll @@ -153,8 +153,17 @@ ; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] ; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[INV_MASK]], [[VALOPERAND_SHIFTED]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[ANDOPERAND]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; CHECK-NEXT: ret i16 [[EXTRACTED]] ; @@ -206,8 +215,17 @@ ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw or ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; CHECK-NEXT: ret i16 [[EXTRACTED]] ; @@ -226,8 +244,17 @@ ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw xor ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = xor i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; CHECK-NEXT: ret i16 [[EXTRACTED]] ; Index: llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll =================================================================== --- llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll +++ llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll @@ -162,8 +162,17 @@ ; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] ; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[INV_MASK]], [[VALOPERAND_SHIFTED]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[ANDOPERAND]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED]] ; @@ -215,8 +224,17 @@ ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw or ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED]] ; @@ -235,8 +253,17 @@ ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw xor ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = xor i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED]] ;