Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -14085,7 +14085,8 @@ SSID == SyncScope::System || SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"); - switch (RMW->getOperation()) { + auto Op = RMW->getOperation(); + switch (Op) { case AtomicRMWInst::FAdd: { Type *Ty = RMW->getType(); @@ -14160,18 +14161,28 @@ return AtomicExpansionKind::CmpXChg; } - case AtomicRMWInst::FMin: - case AtomicRMWInst::FMax: - case AtomicRMWInst::Min: + case AtomicRMWInst::Xchg: + case AtomicRMWInst::Add: + // PCIe supports add and xchg for system atomics. + break; + case AtomicRMWInst::Sub: + case AtomicRMWInst::And: + case AtomicRMWInst::Or: + case AtomicRMWInst::Xor: case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: case AtomicRMWInst::UMin: - case AtomicRMWInst::UMax: { + case AtomicRMWInst::FMin: + case AtomicRMWInst::FMax: + case AtomicRMWInst::UIncWrap: + case AtomicRMWInst::UDecWrap: { if (AMDGPU::isFlatGlobalAddrSpace(AS)) { - if (RMW->getType()->isFloatTy() && + if (AtomicRMWInst::isFPOperation(Op) && unsafeFPAtomicsDisabled(RMW->getFunction())) return AtomicExpansionKind::CmpXChg; - // Always expand system scope min/max atomics. + // Always expand system scope atomics. if (HasSystemScope) return AtomicExpansionKind::CmpXChg; } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll @@ -477,77 +477,172 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; CI-NEXT: s_mov_b64 s[2:3], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s2, s2, 16 -; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_load_dword s0, s[6:7], 0x4 +; CI-NEXT: s_add_u32 s6, s6, 16 +; CI-NEXT: s_addc_u32 s7, s7, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: .LBB6_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v1, v0 +; CI-NEXT: v_subrev_i32_e32 v0, vcc, 1, v1 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; CI-NEXT: v_mov_b32_e32 v2, s6 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s7 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; CI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; CI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; CI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; CI-NEXT: s_cbranch_execnz .LBB6_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end +; CI-NEXT: s_or_b64 exec, exec, s[2:3] +; CI-NEXT: v_mov_b32_e32 v1, s4 +; CI-NEXT: v_mov_b32_e32 v2, s5 +; CI-NEXT: flat_store_dword v[1:2], v0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_ret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s2, 16 -; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_load_dword s0, s[6:7], 0x10 +; VI-NEXT: s_add_u32 s6, s6, 16 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: .LBB6_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; VI-NEXT: s_cbranch_execnz .LBB6_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_dec_ret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v2 +; GFX9-NEXT: v_subrev_u32_e32 v1, 1, v2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 42, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc +; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[6:7] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB6_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_dec_ret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x10 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_cmp_lt_u32_e64 s0, 42, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 1, v2 +; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 42, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc +; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[6:7] offset:16 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_dec_ret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: v_cmp_lt_u32_e64 s0, 42, v2 +; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 42, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[6:7] offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -698,67 +793,153 @@ ; CI-LABEL: global_atomic_dec_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_mov_b64 s[2:3], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 16 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_load_dword s6, s[0:1], 0x4 +; CI-NEXT: s_add_u32 s4, s0, 16 +; CI-NEXT: s_addc_u32 s5, s1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s6 +; CI-NEXT: .LBB9_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_subrev_i32_e32 v0, vcc, 1, v1 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_dec v[0:1], v2 +; CI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; CI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; CI-NEXT: v_mov_b32_e32 v1, v0 +; CI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; CI-NEXT: s_cbranch_execnz .LBB9_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_load_dword s6, s[0:1], 0x10 +; VI-NEXT: s_add_u32 s4, s0, 16 +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: .LBB9_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; VI-NEXT: s_cbranch_execnz .LBB9_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_dec_noret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x10 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; GFX9-NEXT: v_subrev_u32_e32 v0, 1, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v1, v0, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_dec_noret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s0, s[2:3], 0x10 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_cmp_lt_u32_e64 s0, 42, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 1, v1 +; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 42, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_dec v1, v0, s[0:1] offset:16 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_dec_noret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cmp_lt_u32_e64 s0, 42, v1 +; GFX11-NEXT: v_subrev_nc_u32_e32 v0, 1, v1 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 42, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] offset:16 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4 @@ -1124,90 +1305,182 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s2, s2, 16 -; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: s_add_u32 s2, s6, 16 +; CI-NEXT: s_addc_u32 s3, s7, 0 ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: s_mov_b64 s[6:7], 0 +; CI-NEXT: .LBB14_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; CI-NEXT: v_mov_b32_e32 v1, v0 +; CI-NEXT: v_subrev_i32_e32 v0, vcc, 1, v1 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; CI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; CI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; CI-NEXT: s_cbranch_execnz .LBB14_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end +; CI-NEXT: s_or_b64 exec, exec, s[6:7] +; CI-NEXT: v_mov_b32_e32 v1, s4 +; CI-NEXT: v_mov_b32_e32 v2, s5 +; CI-NEXT: flat_store_dword v[1:2], v0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_ret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s2, 16 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: s_add_u32 s2, s6, 16 +; VI-NEXT: s_addc_u32 s3, s7, 0 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: .LBB14_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: s_cbranch_execnz .LBB14_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_subrev_u32_e32 v0, 1, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: flat_store_dword v[1:2], v0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s2, s2, 16 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_add_u32 s2, s6, 16 +; GFX10-NEXT: s_addc_u32 s3, s7, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: flat_load_dword v0, v[0:1] +; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_cmp_lt_u32_e64 s0, 42, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 1, v1 +; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 42, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:16 +; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: v_mov_b32_e32 v3, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cmp_lt_u32_e64 s0, 42, v1 +; GFX11-NEXT: v_subrev_nc_u32_e32 v0, 1, v1 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 42, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s5 +; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4 @@ -1374,77 +1647,162 @@ ; CI-LABEL: flat_atomic_dec_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_mov_b64 s[4:5], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 16 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_add_u32 s2, s0, 16 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dword v1, v[0:1] +; CI-NEXT: .LBB17_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_dec v[0:1], v2 +; CI-NEXT: v_subrev_i32_e32 v0, vcc, 1, v1 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; CI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CI-NEXT: v_mov_b32_e32 v1, v0 +; CI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; CI-NEXT: s_cbranch_execnz .LBB17_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: .LBB17_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB17_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_dec v[0:1], v2 offset:16 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], 42, v1 +; GFX9-NEXT: v_subrev_u32_e32 v0, 1, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s0, 16 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_add_u32 s2, s0, 16 +; GFX10-NEXT: s_addc_u32 s3, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: flat_load_dword v1, v[0:1] +; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_cmp_lt_u32_e64 s0, 42, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 1, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 42, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_dec v[0:1], v2 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_load_b32 v1, v[0:1] offset:16 +; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_cmp_lt_u32_e64 s0, 42, v1 +; GFX11-NEXT: v_subrev_nc_u32_e32 v0, 1, v1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 42, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:16 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4 @@ -2053,82 +2411,188 @@ ; CI-LABEL: flat_atomic_dec_noret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b64 s[4:5], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 32 +; CI-NEXT: s_add_u32 s2, s0, 32 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: s_add_u32 s0, s0, 36 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v4, s1 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v3, s0 +; CI-NEXT: flat_load_dword v2, v[0:1] +; CI-NEXT: flat_load_dword v3, v[3:4] +; CI-NEXT: .LBB24_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; CI-NEXT: v_subrev_i32_e32 v0, vcc, 1, v2 +; CI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc +; CI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; CI-NEXT: v_cmp_lt_u64_e64 s[0:1], 42, v[2:3] +; CI-NEXT: v_mov_b32_e32 v5, s3 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; CI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; CI-NEXT: v_mov_b32_e32 v3, v1 +; CI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; CI-NEXT: s_cbranch_execnz .LBB24_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_noret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: s_add_u32 s0, s0, 36 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: flat_load_dword v3, v[3:4] +; VI-NEXT: .LBB24_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: v_subrev_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_u64_e64 s[0:1], 42, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB24_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[0:1], 42, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB24_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s0, 32 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_add_u32 s2, s0, 32 +; GFX10-NEXT: s_addc_u32 s3, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, 42, v[2:3] +; GFX10-NEXT: v_sub_co_u32 v0, s1, v2, 1 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s1, 0, v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 42, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] offset:32 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, 42, v[2:3] +; GFX11-NEXT: v_sub_co_u32 v0, s1, v2, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_subrev_co_ci_u32_e64 v1, s1, 0, v3, s1 +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 42, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8 @@ -2961,82 +3425,193 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_dec_ret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; CI-NEXT: s_mov_b64 s[8:9], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s2, s2, 32 -; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 +; CI-NEXT: s_add_u32 s6, s6, 32 +; CI-NEXT: s_addc_u32 s7, s7, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: .LBB34_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v3, v1 +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; CI-NEXT: v_cmp_lt_u64_e64 s[0:1], 42, v[2:3] +; CI-NEXT: v_subrev_i32_e64 v0, s[2:3], 1, v2 +; CI-NEXT: v_mov_b32_e32 v4, s6 +; CI-NEXT: v_subbrev_u32_e64 v1, s[2:3], 0, v3, s[2:3] +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v5, s7 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; CI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; CI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; CI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; CI-NEXT: s_cbranch_execnz .LBB34_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end +; CI-NEXT: s_or_b64 exec, exec, s[8:9] +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_ret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s2, 32 -; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x20 +; VI-NEXT: s_add_u32 s6, s6, 32 +; VI-NEXT: s_addc_u32 s7, s7, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: .LBB34_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_u64_e64 s[0:1], 42, v[2:3] +; VI-NEXT: v_subrev_u32_e64 v0, s[2:3], 1, v2 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_subbrev_u32_e64 v1, s[2:3], 0, v3, s[2:3] +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB34_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[8:9] +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_dec_ret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x20 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[5:6] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[0:1], 42, v[5:6] +; GFX9-NEXT: v_subrev_co_u32_e64 v0, s[2:3], 1, v5 +; GFX9-NEXT: v_subbrev_co_u32_e64 v1, s[2:3], 0, v6, s[2:3] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v0, 42, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, 0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[6:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB34_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_dec_ret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x20 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[5:6] +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, 42, v[5:6] +; GFX10-NEXT: v_sub_co_u32 v0, s1, v5, 1 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s1, 0, v6, s1 +; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, 42, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[6:7] offset:32 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[5:6] +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz .LBB34_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_dec_ret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x20 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[5:6] +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, 42, v[5:6] +; GFX11-NEXT: v_sub_co_u32 v0, s1, v5, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_subrev_co_ci_u32_e64 v1, s1, 0, v6, s1 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v0, 42, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v4, v1, 0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[0:1], v2, v[3:6], s[6:7] offset:32 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[5:6] +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -3197,72 +3772,175 @@ ; CI-LABEL: global_atomic_dec_noret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b64 s[2:3], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 32 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x8 +; CI-NEXT: s_add_u32 s4, s0, 32 +; CI-NEXT: s_addc_u32 s5, s1, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s6 +; CI-NEXT: v_mov_b32_e32 v3, s7 +; CI-NEXT: .LBB37_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_subrev_i32_e32 v0, vcc, 1, v2 +; CI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc +; CI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; CI-NEXT: v_cmp_lt_u64_e64 s[0:1], 42, v[2:3] +; CI-NEXT: v_mov_b32_e32 v4, s4 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v5, s5 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; CI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; CI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; CI-NEXT: v_mov_b32_e32 v3, v1 +; CI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; CI-NEXT: s_cbranch_execnz .LBB37_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_noret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 32 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20 +; VI-NEXT: s_add_u32 s4, s0, 32 +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: .LBB37_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_u64_e64 s[0:1], 42, v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; VI-NEXT: s_cbranch_execnz .LBB37_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_dec_noret_i64_offset_system: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[0:1], 42, v[2:3] +; GFX9-NEXT: v_subrev_co_u32_e64 v0, s[2:3], 1, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v1, s[2:3], 0, v3, s[2:3] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 42, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB37_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_dec_noret_i64_offset_system: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x20 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, 42, v[2:3] +; GFX10-NEXT: v_sub_co_u32 v0, s1, v2, 1 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s1, 0, v3, s1 +; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 42, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] offset:32 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB37_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_dec_noret_i64_offset_system: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x20 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, 42, v[2:3] +; GFX11-NEXT: v_sub_co_u32 v0, s1, v2, 1 +; GFX11-NEXT: v_subrev_co_ci_u32_e64 v1, s1, 0, v3, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 42, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] offset:32 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -478,76 +478,160 @@ ; CI-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_mov_b64 s[4:5], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_load_dword s6, s[2:3], 0x4 ; CI-NEXT: s_add_u32 s2, s2, 16 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s6 +; CI-NEXT: .LBB6_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v1, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; CI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; CI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; CI-NEXT: s_cbranch_execnz .LBB6_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end +; CI-NEXT: s_or_b64 exec, exec, s[4:5] +; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: flat_store_dword v[1:2], v0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s6, s[2:3], 0x10 ; VI-NEXT: s_add_u32 s2, s2, 16 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: .LBB6_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB6_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x10 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_add_u32_e32 v1, 1, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 42, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc +; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[2:3] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB6_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x10 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc +; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[2:3] offset:16 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 1, v2 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[2:3] offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -697,68 +781,143 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i32_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; CI-NEXT: s_mov_b64 s[0:1], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 16 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_load_dword s4, s[2:3], 0x4 +; CI-NEXT: s_add_u32 s2, s2, 16 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s4 +; CI-NEXT: .LBB9_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_inc v[0:1], v2 +; CI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v1, v0 +; CI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; CI-NEXT: s_cbranch_execnz .LBB9_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_noret_i32_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_load_dword s4, s[2:3], 0x10 +; VI-NEXT: s_add_u32 s2, s2, 16 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: .LBB9_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_cbranch_execnz .LBB9_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_u32_e32 v0, 1, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v1, v0, s[0:1] offset:16 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x10 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_inc v1, v0, s[0:1] offset:16 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v1 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4 @@ -1528,17 +1687,35 @@ ; CI-LABEL: global_atomic_inc_ret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b64 s[4:5], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x8 ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s6 +; CI-NEXT: v_mov_b32_e32 v1, s7 +; CI-NEXT: .LBB19_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v3, v1 +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; CI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; CI-NEXT: v_mov_b32_e32 v5, s3 +; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; CI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; CI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; CI-NEXT: s_cbranch_execnz .LBB19_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end +; CI-NEXT: s_or_b64 exec, exec, s[4:5] ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1547,17 +1724,35 @@ ; VI-LABEL: global_atomic_inc_ret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x20 ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: .LBB19_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB19_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1566,42 +1761,105 @@ ; GFX9-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x20 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, 42, v[5:6] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v0, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[2:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB19_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v5, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v6, vcc_lo +; GFX10-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[5:6] +; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[2:3] offset:32 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[5:6] +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB19_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x20 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v5, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v6, vcc_lo +; GFX11-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[5:6] +; GFX11-NEXT: v_cndmask_b32_e64 v3, v0, 0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX11-NEXT: global_atomic_cmpswap_b64 v[0:1], v2, v[3:6], s[2:3] offset:32 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[5:6] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1762,73 +2020,163 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspace(1) %ptr) #1 { ; CI-LABEL: global_atomic_inc_noret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; CI-NEXT: s_mov_b64 s[0:1], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 32 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; CI-NEXT: s_add_u32 s2, s2, 32 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: .LBB22_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; CI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; CI-NEXT: v_mov_b32_e32 v5, s3 +; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; CI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; CI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; CI-NEXT: v_mov_b32_e32 v3, v1 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; CI-NEXT: s_cbranch_execnz .LBB22_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_noret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 32 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20 +; VI-NEXT: s_add_u32 s2, s2, 32 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: .LBB22_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_cbranch_execnz .LBB22_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB22_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x20 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x20 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, 1 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo +; GFX11-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8 @@ -2205,89 +2553,169 @@ ; CI-LABEL: flat_atomic_inc_ret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_mov_b64 s[4:5], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: .LBB27_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; CI-NEXT: v_mov_b32_e32 v1, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; CI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; CI-NEXT: s_cbranch_execnz .LBB27_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end +; CI-NEXT: s_or_b64 exec, exec, s[4:5] +; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: flat_store_dword v[1:2], v0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_ret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: .LBB27_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB27_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GFX9-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_add_u32_e32 v0, 1, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB27_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: flat_store_dword v[1:2], v0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 16 ; GFX10-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: flat_load_dword v0, v[0:1] +; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; GFX10-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:16 +; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_add_nc_u32 v0, 1, v1 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: flat_store_b32 v[1:2], v0 ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4 @@ -2454,77 +2882,151 @@ ; CI-LABEL: flat_atomic_inc_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_mov_b64 s[2:3], 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_load_dword v1, v[0:1] +; CI-NEXT: .LBB30_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_inc v[0:1], v2 +; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; CI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; CI-NEXT: v_mov_b32_e32 v1, v0 +; CI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; CI-NEXT: s_cbranch_execnz .LBB30_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: .LBB30_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; VI-NEXT: s_cbranch_execnz .LBB30_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc v[0:1], v2 offset:16 +; GFX9-NEXT: v_add_u32_e32 v0, 1, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB30_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 +; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 16 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: flat_load_dword v1, v[0:1] +; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_inc v[0:1], v2 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_load_b32 v1, v[0:1] offset:16 +; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_add_nc_u32 v0, 1, v1 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v1 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], v[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4 @@ -3066,63 +3568,127 @@ ; CI-LABEL: flat_atomic_inc_ret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s2, s2, 32 +; CI-NEXT: s_add_u32 s4, s2, 32 +; CI-NEXT: s_addc_u32 s5, s3, 0 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: s_add_u32 s2, s2, 36 +; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: flat_load_dword v1, v[2:3] +; CI-NEXT: s_mov_b64 s[2:3], 0 +; CI-NEXT: .LBB36_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: v_mov_b32_e32 v3, v1 +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; CI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; CI-NEXT: v_mov_b32_e32 v4, s4 +; CI-NEXT: v_mov_b32_e32 v5, s5 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; CI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; CI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; CI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; CI-NEXT: s_cbranch_execnz .LBB36_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end +; CI-NEXT: s_or_b64 exec, exec, s[2:3] ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_add_u32 s0, s0, 4 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: flat_store_dword v[2:3], v0 -; CI-NEXT: flat_store_dword v[4:5], v1 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dword v[2:3], v1 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s2, 32 +; VI-NEXT: s_add_u32 s4, s2, 32 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_add_u32 s2, s2, 36 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: s_mov_b64 s[2:3], 0 +; VI-NEXT: .LBB36_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; VI-NEXT: s_cbranch_execnz .LBB36_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_add_u32 s0, s0, 4 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dword v[2:3], v0 -; VI-NEXT: flat_store_dword v[4:5], v1 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dword v[2:3], v1 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB36_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -3131,19 +3697,37 @@ ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 32 ; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB36_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -3152,16 +3736,36 @@ ; GFX11-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, 1 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo +; GFX11-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm @@ -3339,83 +3943,177 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64_offset_system: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 32 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_add_u32 s0, s2, 32 +; CI-NEXT: s_addc_u32 s1, s3, 0 +; CI-NEXT: s_add_u32 s2, s2, 36 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v4, s3 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v3, s2 +; CI-NEXT: flat_load_dword v2, v[0:1] +; CI-NEXT: flat_load_dword v3, v[3:4] +; CI-NEXT: s_mov_b64 s[2:3], 0 +; CI-NEXT: .LBB39_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; CI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; CI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; CI-NEXT: v_mov_b32_e32 v3, v1 +; CI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; CI-NEXT: s_cbranch_execnz .LBB39_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_noret_i64_offset_system: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 32 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_u32 s0, s2, 32 +; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_add_u32 s2, s2, 36 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: flat_load_dword v3, v[3:4] +; VI-NEXT: s_mov_b64 s[2:3], 0 +; VI-NEXT: .LBB39_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; VI-NEXT: s_cbranch_execnz .LBB39_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB39_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 32 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v5, s1 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz .LBB39_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] offset:32 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, 1 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo +; GFX11-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8 Index: llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -911,25 +911,67 @@ ; GCN1-LABEL: flat_atomic_sub_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB24_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB24_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB24_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB24_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v[0:1], v2 +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB24_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB24_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr %ptr, i32 %in seq_cst ret void @@ -941,10 +983,23 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB25_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v4, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB25_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_noret_offset: @@ -952,18 +1007,45 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB25_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub v[0:1], v2 +; GCN2-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB25_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB25_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB25_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst @@ -974,25 +1056,70 @@ ; GCN1-LABEL: flat_atomic_sub_i32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB26_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB26_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB26_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB26_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB26_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB26_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr %ptr, i32 %in seq_cst ret i32 %result @@ -1002,31 +1129,72 @@ ; GCN1-LABEL: flat_atomic_sub_i32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB27_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v1, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB27_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB27_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v1, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB27_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB27_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB27_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw sub ptr %gep, i32 %in seq_cst @@ -1039,11 +1207,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB28_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB28_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_noret_scalar: @@ -1051,11 +1233,25 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB28_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB28_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_noret_scalar: @@ -1063,11 +1259,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB28_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_subrev_u32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v[0:1], v2 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB28_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr %ptr, i32 %in seq_cst ret void @@ -1081,11 +1291,25 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB29_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB29_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_noret_offset_scalar: @@ -1095,11 +1319,25 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB29_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB29_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_noret_offset_scalar: @@ -1107,11 +1345,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB29_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_subrev_u32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB29_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst @@ -1124,11 +1376,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB30_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB30_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_ret_scalar: @@ -1136,11 +1402,25 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB30_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB30_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_ret_scalar: @@ -1148,11 +1428,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB30_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_subrev_u32_e32 v0, s6, v1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB30_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr %ptr, i32 %in seq_cst ret i32 %result @@ -1166,11 +1460,25 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB31_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB31_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_ret_offset_scalar: @@ -1180,11 +1488,25 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB31_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB31_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_ret_offset_scalar: @@ -1192,11 +1514,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB31_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_subrev_u32_e32 v0, s6, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB31_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw sub ptr %gep, i32 %in seq_cst @@ -1211,25 +1547,67 @@ ; GCN1-LABEL: flat_atomic_and_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB32_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB32_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB32_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB32_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v[0:1], v2 +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB32_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB32_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr %ptr, i32 %in seq_cst ret void @@ -1241,10 +1619,23 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB33_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB33_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_noret_offset: @@ -1252,18 +1643,45 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB33_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB33_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB33_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB33_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst @@ -1274,25 +1692,70 @@ ; GCN1-LABEL: flat_atomic_and_i32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB34_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB34_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB34_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB34_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB34_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB34_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr %ptr, i32 %in seq_cst ret i32 %result @@ -1302,31 +1765,72 @@ ; GCN1-LABEL: flat_atomic_and_i32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB35_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_and_b32_e32 v0, v1, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB35_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB35_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_and_b32_e32 v0, v1, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB35_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB35_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB35_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw and ptr %gep, i32 %in seq_cst @@ -1339,11 +1843,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB36_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB36_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_noret_scalar: @@ -1351,11 +1869,25 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB36_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and v[0:1], v2 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB36_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_noret_scalar: @@ -1363,11 +1895,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB36_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v[0:1], v2 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB36_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr %ptr, i32 %in seq_cst ret void @@ -1381,11 +1927,25 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB37_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB37_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_noret_offset_scalar: @@ -1395,11 +1955,25 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB37_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and v[0:1], v2 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB37_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_noret_offset_scalar: @@ -1407,11 +1981,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB37_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB37_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst @@ -1424,11 +2012,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB38_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB38_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_ret_scalar: @@ -1436,11 +2038,25 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB38_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB38_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_ret_scalar: @@ -1448,11 +2064,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB38_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB38_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr %ptr, i32 %in seq_cst ret i32 %result @@ -1466,11 +2096,25 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB39_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB39_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_ret_offset_scalar: @@ -1480,11 +2124,25 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB39_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB39_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_ret_offset_scalar: @@ -1492,11 +2150,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB39_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB39_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw and ptr %gep, i32 %in seq_cst @@ -2171,25 +2843,67 @@ ; GCN1-LABEL: flat_atomic_or_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB48_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB48_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB48_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB48_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v[0:1], v2 +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB48_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB48_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr %ptr, i32 %in seq_cst ret void @@ -2201,10 +2915,23 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB49_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v3, v4, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB49_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_noret_offset: @@ -2212,18 +2939,45 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB49_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or v[0:1], v2 +; GCN2-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB49_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB49_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB49_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst @@ -2234,25 +2988,70 @@ ; GCN1-LABEL: flat_atomic_or_i32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB50_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB50_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB50_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr %ptr, i32 %in seq_cst ret i32 %result @@ -2262,31 +3061,72 @@ ; GCN1-LABEL: flat_atomic_or_i32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB51_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_or_b32_e32 v0, v1, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB51_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB51_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_or_b32_e32 v0, v1, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB51_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB51_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB51_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw or ptr %gep, i32 %in seq_cst @@ -2299,11 +3139,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB52_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB52_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_noret_scalar: @@ -2311,11 +3165,25 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB52_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB52_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_noret_scalar: @@ -2323,11 +3191,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB52_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v[0:1], v2 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB52_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr %ptr, i32 %in seq_cst ret void @@ -2341,11 +3223,25 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB53_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB53_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_noret_offset_scalar: @@ -2355,11 +3251,25 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB53_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB53_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_noret_offset_scalar: @@ -2367,11 +3277,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB53_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB53_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst @@ -2384,11 +3308,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB54_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB54_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_ret_scalar: @@ -2396,11 +3334,25 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB54_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_or_b32_e32 v0, s6, v1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB54_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_ret_scalar: @@ -2408,11 +3360,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB54_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_or_b32_e32 v0, s6, v1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB54_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr %ptr, i32 %in seq_cst ret i32 %result @@ -2426,11 +3392,25 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_or_b32_e32 v0, s6, v1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB55_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_ret_offset_scalar: @@ -2440,11 +3420,25 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_or_b32_e32 v0, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB55_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_ret_offset_scalar: @@ -2452,11 +3446,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_or_b32_e32 v0, s6, v1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB55_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw or ptr %gep, i32 %in seq_cst @@ -2471,25 +3479,67 @@ ; GCN1-LABEL: flat_atomic_xor_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB56_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB56_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v[0:1], v2 +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB56_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst ret void @@ -2501,10 +3551,23 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB57_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, v4, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB57_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_noret_offset: @@ -2512,18 +3575,45 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB57_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB57_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB57_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst @@ -2534,25 +3624,70 @@ ; GCN1-LABEL: flat_atomic_xor_i32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB58_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB58_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB58_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i32 %in seq_cst ret i32 %result @@ -2562,31 +3697,72 @@ ; GCN1-LABEL: flat_atomic_xor_i32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_xor_b32_e32 v0, v1, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB59_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_xor_b32_e32 v0, v1, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB59_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB59_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw xor ptr %gep, i32 %in seq_cst @@ -2599,11 +3775,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB60_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB60_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_noret_scalar: @@ -2611,11 +3801,25 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB60_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB60_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_noret_scalar: @@ -2623,11 +3827,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB60_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v[0:1], v2 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB60_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst ret void @@ -2641,11 +3859,25 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB61_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB61_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_noret_offset_scalar: @@ -2655,11 +3887,25 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB61_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB61_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_noret_offset_scalar: @@ -2667,11 +3913,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB61_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB61_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst @@ -2684,11 +3944,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB62_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB62_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_ret_scalar: @@ -2696,11 +3970,25 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB62_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB62_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_ret_scalar: @@ -2708,11 +3996,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB62_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB62_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i32 %in seq_cst ret i32 %result @@ -2726,11 +4028,25 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB63_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB63_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_ret_offset_scalar: @@ -2740,11 +4056,25 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB63_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB63_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_ret_offset_scalar: @@ -2752,11 +4082,25 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB63_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB63_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw xor ptr %gep, i32 %in seq_cst @@ -6517,25 +7861,73 @@ ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB107_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 1, v4 +; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB107_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB107_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 1, v4 +; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB107_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v[0:1], v2 +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB107_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_u32_e32 v3, 1, v4 +; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB107_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst ret void @@ -6547,10 +7939,25 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB108_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 1, v4 +; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB108_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_offset: @@ -6558,18 +7965,49 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB108_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc v[0:1], v2 +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 1, v4 +; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB108_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB108_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_u32_e32 v3, 1, v4 +; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB108_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst @@ -6580,25 +8018,76 @@ ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB109_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 1, v4 +; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB109_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB109_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 1, v4 +; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB109_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB109_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_add_u32_e32 v3, 1, v4 +; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB109_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst ret i32 %result @@ -6608,31 +8097,78 @@ ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB110_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB110_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB110_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB110_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB110_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_add_u32_e32 v3, 1, v4 +; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB110_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst @@ -6645,11 +8181,27 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc v[0:1], v2 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB111_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar: @@ -6657,11 +8209,27 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB111_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar: @@ -6669,11 +8237,27 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v[0:1], v2 +; GCN3-NEXT: v_add_u32_e32 v0, 1, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB111_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst ret void @@ -6687,11 +8271,27 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB112_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar: @@ -6701,11 +8301,27 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc v[0:1], v2 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB112_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar: @@ -6713,11 +8329,27 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_u32_e32 v0, 1, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB112_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst @@ -6730,11 +8362,27 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB113_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar: @@ -6742,11 +8390,27 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB113_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar: @@ -6754,11 +8418,27 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_add_u32_e32 v0, 1, v1 +; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB113_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst ret i32 %result @@ -6772,11 +8452,27 @@ ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB114_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar: @@ -6786,11 +8482,27 @@ ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB114_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar: @@ -6798,11 +8510,27 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_add_u32_e32 v0, 1, v1 +; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB114_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst @@ -6817,25 +8545,79 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB115_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB115_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v[0:1], v2 +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN3-NEXT: v_add_u32_e32 v3, -1, v4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB115_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst ret void @@ -6847,10 +8629,27 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB116_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_offset: @@ -6858,18 +8657,53 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec v[0:1], v2 +; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB116_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN3-NEXT: v_add_u32_e32 v3, -1, v4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB116_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst @@ -6880,25 +8714,82 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB117_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB117_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN3-NEXT: v_add_u32_e32 v3, -1, v4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB117_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst ret i32 %result @@ -6908,31 +8799,84 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB118_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB118_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB118_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB118_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB118_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN3-NEXT: v_add_u32_e32 v3, -1, v4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB118_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst @@ -6945,11 +8889,30 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB119_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec v[0:1], v2 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB119_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_scalar: @@ -6957,11 +8920,30 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB119_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB119_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_scalar: @@ -6969,11 +8951,30 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB119_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN3-NEXT: v_add_u32_e32 v0, -1, v1 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v[0:1], v2 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB119_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst ret void @@ -6983,29 +8984,67 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 16 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: s_add_u32 s36, s4, 16 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: s_mov_b64 s[38:39], 0 +; GCN1-NEXT: .LBB120_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v4, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s36 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, s37 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec v[0:1], v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_cbranch_execnz .LBB120_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 16 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: s_add_u32 s36, s4, 16 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: s_mov_b64 s[38:39], 0 +; GCN2-NEXT: .LBB120_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v4, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s36 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, s37 +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec v[0:1], v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_cbranch_execnz .LBB120_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar: @@ -7013,11 +9052,30 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v1, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB120_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN3-NEXT: v_add_u32_e32 v0, -1, v1 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB120_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst @@ -7030,11 +9088,30 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, -1, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB121_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_scalar: @@ -7042,11 +9119,30 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, -1, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB121_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_scalar: @@ -7054,11 +9150,30 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_add_u32_e32 v4, -1, v1 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB121_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst ret i32 %result @@ -7068,29 +9183,67 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 16 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: s_add_u32 s36, s4, 16 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: s_mov_b64 s[38:39], 0 +; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, -1, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s36 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, s37 +; GCN1-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_cbranch_execnz .LBB122_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 16 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: s_add_u32 s36, s4, 16 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: s_mov_b64 s[38:39], 0 +; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, -1, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s36 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, s37 +; GCN2-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_cbranch_execnz .LBB122_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar: @@ -7098,11 +9251,30 @@ ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v1, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_add_u32_e32 v4, -1, v1 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB122_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst Index: llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -947,25 +947,79 @@ ; GCN1-LABEL: flat_atomic_sub_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB24_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; GCN1-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB24_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB24_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; GCN2-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB24_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB24_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB24_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst ret void @@ -975,31 +1029,83 @@ ; GCN1-LABEL: flat_atomic_sub_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB25_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; GCN1-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB25_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[0:1] +; GCN2-NEXT: flat_load_dword v6, v[8:9] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB25_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; GCN2-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB25_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB25_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB25_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst @@ -1010,25 +1116,85 @@ ; GCN1-LABEL: flat_atomic_sub_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: flat_load_dword v5, v[5:6] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB26_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; GCN1-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB26_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: flat_load_dword v5, v[5:6] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB26_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; GCN2-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB26_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB26_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB26_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr %ptr, i64 %in seq_cst ret i64 %result @@ -1038,31 +1204,85 @@ ; GCN1-LABEL: flat_atomic_sub_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB27_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_sub_i32_e32 v6, vcc, v8, v2 +; GCN1-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB27_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB27_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_sub_u32_e32 v6, vcc, v8, v2 +; GCN2-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB27_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB27_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB27_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst @@ -1073,44 +1293,102 @@ ; GCN1-LABEL: flat_atomic_sub_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: v_mov_b32_e32 v4, s35 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[3:4] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB28_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB28_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: v_mov_b32_e32 v4, s35 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[3:4] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB28_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB28_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB28_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_setpc_b64 s[30:31] - %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst - ret void -} +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB28_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst + ret void +} define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { ; GCN1-LABEL: flat_atomic_sub_i64_noret_offset_scalar: @@ -1118,14 +1396,35 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB29_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB29_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_noret_offset_scalar: @@ -1133,27 +1432,64 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB29_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB29_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB29_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB29_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst @@ -1164,40 +1500,98 @@ ; GCN1-LABEL: flat_atomic_sub_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB30_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 ; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB30_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB30_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 ; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB30_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB30_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB30_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr %ptr, i64 %in seq_cst ret i64 %result @@ -1209,14 +1603,35 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB31_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB31_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i64_ret_offset_scalar: @@ -1224,27 +1639,64 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB31_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB31_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB31_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 ; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB31_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst @@ -1259,25 +1711,79 @@ ; GCN1-LABEL: flat_atomic_and_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB32_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB32_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB32_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB32_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB32_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB32_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst ret void @@ -1287,31 +1793,83 @@ ; GCN1-LABEL: flat_atomic_and_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB33_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB33_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[0:1] +; GCN2-NEXT: flat_load_dword v6, v[8:9] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB33_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_and_b32_e32 v4, v6, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB33_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB33_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB33_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst @@ -1322,25 +1880,85 @@ ; GCN1-LABEL: flat_atomic_and_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: flat_load_dword v5, v[5:6] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB34_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB34_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: flat_load_dword v5, v[5:6] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB34_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB34_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB34_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB34_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr %ptr, i64 %in seq_cst ret i64 %result @@ -1350,31 +1968,85 @@ ; GCN1-LABEL: flat_atomic_and_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB35_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_and_b32_e32 v7, v9, v3 +; GCN1-NEXT: v_and_b32_e32 v6, v8, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB35_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB35_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_and_b32_e32 v7, v9, v3 +; GCN2-NEXT: v_and_b32_e32 v6, v8, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB35_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB35_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB35_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst @@ -1385,40 +2057,95 @@ ; GCN1-LABEL: flat_atomic_and_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: v_mov_b32_e32 v4, s35 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[3:4] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB36_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB36_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: v_mov_b32_e32 v4, s35 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[3:4] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB36_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB36_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB36_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN3-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB36_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst ret void @@ -1430,14 +2157,34 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB37_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB37_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_noret_offset_scalar: @@ -1445,27 +2192,62 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB37_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB37_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB37_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB37_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst @@ -1476,40 +2258,95 @@ ; GCN1-LABEL: flat_atomic_and_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB38_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB38_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB38_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB38_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB38_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB38_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr %ptr, i64 %in seq_cst ret i64 %result @@ -1521,14 +2358,34 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB39_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB39_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i64_ret_offset_scalar: @@ -1536,27 +2393,62 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB39_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB39_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB39_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB39_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst @@ -2371,25 +3263,79 @@ ; GCN1-LABEL: flat_atomic_or_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB48_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB48_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB48_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB48_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB48_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB48_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst ret void @@ -2399,31 +3345,83 @@ ; GCN1-LABEL: flat_atomic_or_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB49_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_or_b32_e32 v4, v6, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB49_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[0:1] +; GCN2-NEXT: flat_load_dword v6, v[8:9] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB49_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB49_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB49_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB49_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst @@ -2434,25 +3432,85 @@ ; GCN1-LABEL: flat_atomic_or_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: flat_load_dword v5, v[5:6] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB50_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: flat_load_dword v5, v[5:6] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB50_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB50_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB50_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr %ptr, i64 %in seq_cst ret i64 %result @@ -2462,31 +3520,85 @@ ; GCN1-LABEL: flat_atomic_or_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB51_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_or_b32_e32 v7, v9, v3 +; GCN1-NEXT: v_or_b32_e32 v6, v8, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB51_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB51_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_or_b32_e32 v7, v9, v3 +; GCN2-NEXT: v_or_b32_e32 v6, v8, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB51_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB51_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB51_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst @@ -2497,40 +3609,95 @@ ; GCN1-LABEL: flat_atomic_or_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: v_mov_b32_e32 v4, s35 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[3:4] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB52_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB52_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: v_mov_b32_e32 v4, s35 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[3:4] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB52_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB52_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB52_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB52_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst ret void @@ -2542,14 +3709,34 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB53_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB53_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_noret_offset_scalar: @@ -2557,27 +3744,62 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB53_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GCN2-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB53_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB53_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB53_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst @@ -2588,40 +3810,95 @@ ; GCN1-LABEL: flat_atomic_or_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB54_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB54_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB54_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_or_b32_e32 v0, s6, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB54_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB54_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_or_b32_e32 v0, s6, v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB54_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr %ptr, i64 %in seq_cst ret i64 %result @@ -2633,14 +3910,34 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB55_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_or_b32_e32 v0, s6, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB55_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i64_ret_offset_scalar: @@ -2648,27 +3945,62 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB55_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_or_b32_e32 v0, s6, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB55_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB55_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_or_b32_e32 v0, s6, v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB55_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst @@ -2683,25 +4015,79 @@ ; GCN1-LABEL: flat_atomic_xor_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB56_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB56_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB56_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB56_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB56_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB56_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst ret void @@ -2711,31 +4097,83 @@ ; GCN1-LABEL: flat_atomic_xor_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB57_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_xor_b32_e32 v4, v6, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB57_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[0:1] +; GCN2-NEXT: flat_load_dword v6, v[8:9] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB57_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB57_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB57_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB57_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst @@ -2746,25 +4184,85 @@ ; GCN1-LABEL: flat_atomic_xor_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: flat_load_dword v5, v[5:6] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB58_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: flat_load_dword v5, v[5:6] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB58_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB58_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB58_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i64 %in seq_cst ret i64 %result @@ -2774,31 +4272,85 @@ ; GCN1-LABEL: flat_atomic_xor_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_xor_b32_e32 v7, v9, v3 +; GCN1-NEXT: v_xor_b32_e32 v6, v8, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB59_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_xor_b32_e32 v7, v9, v3 +; GCN2-NEXT: v_xor_b32_e32 v6, v8, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB59_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB59_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB59_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst @@ -2809,40 +4361,95 @@ ; GCN1-LABEL: flat_atomic_xor_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: v_mov_b32_e32 v4, s35 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[3:4] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB60_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB60_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: v_mov_b32_e32 v4, s35 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[3:4] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB60_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB60_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB60_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB60_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst ret void @@ -2854,14 +4461,34 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB61_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB61_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_noret_offset_scalar: @@ -2869,27 +4496,62 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB61_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GCN2-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB61_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB61_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB61_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst @@ -2900,40 +4562,95 @@ ; GCN1-LABEL: flat_atomic_xor_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB62_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB62_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB62_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB62_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB62_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB62_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i64 %in seq_cst ret i64 %result @@ -2945,14 +4662,34 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB63_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB63_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i64_ret_offset_scalar: @@ -2960,27 +4697,62 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB63_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB63_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB63_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v2 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB63_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst @@ -7493,25 +9265,88 @@ ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB107_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB107_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB107_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB107_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB107_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB107_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst ret void @@ -7521,31 +9356,92 @@ ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB108_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v6 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB108_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[0:1] +; GCN2-NEXT: flat_load_dword v6, v[8:9] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB108_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v6 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB108_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB108_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB108_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst @@ -7556,25 +9452,94 @@ ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: flat_load_dword v5, v[5:6] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB109_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB109_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: flat_load_dword v5, v[5:6] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB109_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB109_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB109_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB109_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst ret i64 %result @@ -7584,31 +9549,94 @@ ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB110_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v8 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB110_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB110_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v8 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB110_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB110_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB110_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst @@ -7619,40 +9647,104 @@ ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: v_mov_b32_e32 v4, s35 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[3:4] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB111_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB111_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: v_mov_b32_e32 v4, s35 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[3:4] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB111_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB111_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB111_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB111_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst ret void @@ -7664,14 +9756,37 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB112_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB112_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: @@ -7679,27 +9794,68 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB112_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB112_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB112_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB112_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst @@ -7710,40 +9866,104 @@ ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB113_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB113_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB113_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB113_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB113_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB113_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst ret i64 %result @@ -7755,14 +9975,37 @@ ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 32 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: s_add_u32 s36, s4, 36 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 ; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB114_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB114_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: @@ -7770,27 +10013,68 @@ ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 32 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: s_add_u32 s36, s4, 36 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 ; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB114_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB114_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB114_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB114_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst @@ -7805,25 +10089,94 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: s_mov_b64 s[8:9], 0 +; GCN1-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6 +; GCN1-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_cbranch_execnz .LBB115_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: s_mov_b64 s[8:9], 0 +; GCN2-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; GCN2-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB115_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[8:9], 0 +; GCN3-NEXT: .LBB115_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GCN3-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_cbranch_execnz .LBB115_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst ret void @@ -7833,31 +10186,98 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[0:1] +; GCN1-NEXT: flat_load_dword v6, v[8:9] +; GCN1-NEXT: s_mov_b64 s[8:9], 0 +; GCN1-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v0, s[6:7], -1, v6 +; GCN1-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v7, s[6:7] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v1, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_cbranch_execnz .LBB116_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[0:1] +; GCN2-NEXT: flat_load_dword v6, v[8:9] +; GCN2-NEXT: s_mov_b64 s[8:9], 0 +; GCN2-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v0, s[6:7], -1, v6 +; GCN2-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v7, s[6:7] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v1, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB116_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] offset:32 +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[8:9], 0 +; GCN3-NEXT: .LBB116_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GCN3-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_cbranch_execnz .LBB116_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst @@ -7868,25 +10288,100 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: flat_load_dword v5, v[5:6] +; GCN1-NEXT: s_mov_b64 s[8:9], 0 +; GCN1-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6 +; GCN1-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_cbranch_execnz .LBB117_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: flat_load_dword v5, v[5:6] +; GCN2-NEXT: s_mov_b64 s[8:9], 0 +; GCN2-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; GCN2-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB117_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN3-NEXT: s_mov_b64 s[8:9], 0 +; GCN3-NEXT: .LBB117_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GCN3-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_cbranch_execnz .LBB117_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst ret i64 %result @@ -7896,31 +10391,100 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[8:9], 0 +; GCN1-NEXT: .LBB118_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v0, s[6:7], -1, v8 +; GCN1-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_cbranch_execnz .LBB118_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[8:9], 0 +; GCN2-NEXT: .LBB118_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v0, s[6:7], -1, v8 +; GCN2-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB118_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GCN3-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[8:9], 0 +; GCN3-NEXT: .LBB118_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GCN3-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_cbranch_execnz .LBB118_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: v_mov_b32_e32 v0, v4 +; GCN3-NEXT: v_mov_b32_e32 v1, v5 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst @@ -7931,40 +10495,116 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s34 +; GCN1-NEXT: v_mov_b32_e32 v4, s35 +; GCN1-NEXT: flat_load_dword v2, v[0:1] +; GCN1-NEXT: flat_load_dword v3, v[3:4] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB119_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB119_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s34 +; GCN2-NEXT: v_mov_b32_e32 v4, s35 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: flat_load_dword v3, v[3:4] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB119_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: v_mov_b32_e32 v7, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB119_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB119_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, -1, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: v_mov_b32_e32 v7, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB119_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst ret void @@ -7974,44 +10614,120 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 32 +; GCN1-NEXT: s_add_u32 s36, s4, 32 +; GCN1-NEXT: s_addc_u32 s37, s5, 0 +; GCN1-NEXT: s_add_u32 s34, s4, 36 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v4, s36 +; GCN1-NEXT: v_mov_b32_e32 v5, s37 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[38:39], 0 +; GCN1-NEXT: .LBB120_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s36 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v5, s37 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_cbranch_execnz .LBB120_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 32 +; GCN2-NEXT: s_add_u32 s36, s4, 32 +; GCN2-NEXT: s_addc_u32 s37, s5, 0 +; GCN2-NEXT: s_add_u32 s34, s4, 36 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v4, s36 +; GCN2-NEXT: v_mov_b32_e32 v5, s37 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[38:39], 0 +; GCN2-NEXT: .LBB120_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: v_mov_b32_e32 v7, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s36 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v5, s37 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_cbranch_execnz .LBB120_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB120_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, -1, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: v_mov_b32_e32 v7, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] offset:32 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB120_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst @@ -8022,40 +10738,116 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: flat_load_dword v1, v[2:3] +; GCN1-NEXT: s_mov_b64 s[38:39], 0 +; GCN1-NEXT: .LBB121_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v7, s[36:37], -1, v2 +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_cbranch_execnz .LBB121_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: flat_load_dword v1, v[2:3] +; GCN2-NEXT: s_mov_b64 s[38:39], 0 +; GCN2-NEXT: .LBB121_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v7, s[36:37], -1, v2 +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_cbranch_execnz .LBB121_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN3-NEXT: s_mov_b64 s[38:39], 0 +; GCN3-NEXT: .LBB121_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v7, s[36:37], -1, v2 +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN3-NEXT: s_cbranch_execnz .LBB121_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst ret i64 %result @@ -8065,44 +10857,120 @@ ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 32 +; GCN1-NEXT: s_add_u32 s38, s4, 32 +; GCN1-NEXT: s_addc_u32 s39, s5, 0 +; GCN1-NEXT: s_add_u32 s34, s4, 36 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s7 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s38 +; GCN1-NEXT: v_mov_b32_e32 v3, s39 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[40:41], 0 +; GCN1-NEXT: .LBB122_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v7, s[36:37], -1, v2 +; GCN1-NEXT: v_mov_b32_e32 v0, s7 +; GCN1-NEXT: v_mov_b32_e32 v6, s6 +; GCN1-NEXT: v_mov_b32_e32 v4, s38 +; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v5, s39 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN1-NEXT: s_cbranch_execnz .LBB122_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[40:41] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 32 +; GCN2-NEXT: s_add_u32 s38, s4, 32 +; GCN2-NEXT: s_addc_u32 s39, s5, 0 +; GCN2-NEXT: s_add_u32 s34, s4, 36 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s7 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s38 +; GCN2-NEXT: v_mov_b32_e32 v3, s39 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[40:41], 0 +; GCN2-NEXT: .LBB122_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v7, s[36:37], -1, v2 +; GCN2-NEXT: v_mov_b32_e32 v0, s7 +; GCN2-NEXT: v_mov_b32_e32 v6, s6 +; GCN2-NEXT: v_mov_b32_e32 v4, s38 +; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v5, s39 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN2-NEXT: s_cbranch_execnz .LBB122_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[40:41] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s7 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GCN3-NEXT: s_mov_b64 s[38:39], 0 +; GCN3-NEXT: .LBB122_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v7, s[36:37], -1, v2 +; GCN3-NEXT: v_mov_b32_e32 v0, s7 +; GCN3-NEXT: v_mov_b32_e32 v6, s6 +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN3-NEXT: s_cbranch_execnz .LBB122_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[38:39] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst Index: llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -1671,27 +1671,71 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB24_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_i32_e32 v3, vcc, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB24_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB24_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB24_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v[0:1], v2, off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB24_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -1705,10 +1749,26 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB25_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_i32_e32 v3, vcc, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB25_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1717,18 +1777,45 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB25_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB25_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB25_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst @@ -1743,28 +1830,74 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB26_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_sub_i32_e32 v4, vcc, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB26_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB26_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB26_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB26_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -1778,31 +1911,75 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB27_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_sub_i32_e32 v4, vcc, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB27_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB27_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_sub_u32_e32 v0, vcc, v1, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB27_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB27_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst @@ -1822,11 +1999,26 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB28_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_subrev_i32_e32 v1, vcc, s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB28_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1840,22 +2032,48 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB28_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB28_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v0, v1, s[4:5] +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB28_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -1874,11 +2092,26 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB29_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_subrev_i32_e32 v1, vcc, s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB29_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1894,22 +2127,48 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB29_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB29_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v0, v1, s[4:5] offset:16 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB29_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst @@ -1929,11 +2188,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB30_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_subrev_i32_e32 v3, vcc, s34, v4 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB30_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1947,22 +2222,48 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB30_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB30_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB30_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -1981,11 +2282,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB31_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_subrev_i32_e32 v3, vcc, s34, v4 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB31_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2001,22 +2318,48 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB31_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB31_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB31_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst @@ -2035,27 +2378,71 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB32_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB32_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB32_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v3, v4, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB32_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v[0:1], v2, off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB32_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -2069,10 +2456,26 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB33_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB33_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2081,18 +2484,45 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB33_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v3, v4, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB33_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB33_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst @@ -2107,28 +2537,74 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB34_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB34_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB34_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_and_b32_e32 v3, v4, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB34_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB34_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -2142,31 +2618,75 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB35_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB35_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB35_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_and_b32_e32 v0, v1, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB35_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB35_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst @@ -2186,11 +2706,26 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB36_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB36_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2204,22 +2739,48 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB36_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB36_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, s6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v0, v1, s[4:5] +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB36_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -2238,11 +2799,26 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB37_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB37_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2258,22 +2834,48 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB37_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB37_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, s6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v0, v1, s[4:5] offset:16 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB37_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst @@ -2293,11 +2895,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB38_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, s34, v4 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB38_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2311,22 +2929,48 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB38_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_and_b32_e32 v0, s6, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB38_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_and_b32_e32 v2, s6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v0, v0, v1, s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB38_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -2345,11 +2989,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB39_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, s34, v4 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB39_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2365,22 +3025,48 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB39_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_and_b32_e32 v0, s6, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB39_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_and_b32_e32 v2, s6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB39_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst @@ -3130,27 +3816,71 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB48_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB48_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_or_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB48_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v3, v4, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB48_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v[0:1], v2, off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB48_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -3164,10 +3894,26 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB49_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB49_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3176,18 +3922,45 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB49_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v3, v4, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB49_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB49_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst @@ -3202,28 +3975,74 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB50_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB50_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_or_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB50_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_or_b32_e32 v3, v4, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB50_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB50_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -3237,31 +4056,75 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB51_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB51_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_or_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB51_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_or_b32_e32 v0, v1, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB51_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB51_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst @@ -3281,11 +4144,26 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB52_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB52_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3299,22 +4177,48 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB52_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB52_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v0, s6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v0, v1, s[4:5] +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB52_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -3333,11 +4237,26 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB53_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB53_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3353,22 +4272,48 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB53_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB53_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v0, s6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v0, v1, s[4:5] offset:16 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB53_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst @@ -3388,11 +4333,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB54_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v3, s34, v4 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB54_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3406,22 +4367,48 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB54_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_or_b32_e32 v0, s6, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB54_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_or_b32_e32 v2, s6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v0, v0, v1, s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB54_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -3440,11 +4427,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB55_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v3, s34, v4 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB55_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3460,22 +4463,48 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB55_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_or_b32_e32 v0, s6, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB55_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_or_b32_e32 v2, s6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB55_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst @@ -3494,27 +4523,71 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB56_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB56_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB56_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v3, v4, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB56_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v[0:1], v2, off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB56_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -3528,10 +4601,26 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB57_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB57_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3540,18 +4629,45 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB57_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v3, v4, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB57_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB57_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst @@ -3566,28 +4682,74 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB58_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_xor_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB58_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB58_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_xor_b32_e32 v3, v4, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB58_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB58_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -3601,31 +4763,75 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB59_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_xor_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB59_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB59_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_xor_b32_e32 v0, v1, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB59_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB59_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst @@ -3645,11 +4851,26 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB60_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v1, s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB60_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3663,22 +4884,48 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB60_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB60_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v0, v1, s[4:5] +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB60_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -3697,11 +4944,26 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB61_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v1, s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB61_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3717,22 +4979,48 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB61_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v0, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB61_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v0, v1, s[4:5] offset:16 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB61_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst @@ -3752,11 +5040,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB62_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_xor_b32_e32 v3, s34, v4 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB62_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3770,22 +5074,48 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB62_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_xor_b32_e32 v0, s6, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB62_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_xor_b32_e32 v2, s6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB62_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -3804,11 +5134,27 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB63_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_xor_b32_e32 v3, s34, v4 +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB63_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3824,22 +5170,48 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB63_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_xor_b32_e32 v0, s6, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB63_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_xor_b32_e32 v2, s6, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB63_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst @@ -7871,27 +9243,77 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB107_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v4 +; SI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; SI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB107_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB107_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v4 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; VI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB107_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v[0:1], v2, off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 1, v4 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB107_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -7905,10 +9327,28 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB108_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v4 +; SI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; SI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB108_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -7917,18 +9357,49 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB108_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v4 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; VI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB108_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 1, v4 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB108_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -7943,28 +9414,80 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB109_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v5 +; SI-NEXT: v_cmp_lt_u32_e32 vcc, v5, v2 +; SI-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB109_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB109_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v4 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; VI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB109_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB109_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 1, v4 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB109_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -7978,31 +9501,81 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB110_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v5 +; SI-NEXT: v_cmp_lt_u32_e32 vcc, v5, v2 +; SI-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB110_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB110_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB110_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB110_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 1, v4 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB110_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -8022,11 +9595,28 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB111_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v2 +; SI-NEXT: v_cmp_gt_u32_e32 vcc, s34, v2 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB111_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8040,22 +9630,52 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB111_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB111_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 1, v1 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v0, v1, s[4:5] +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB111_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -8074,11 +9694,28 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB112_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v2 +; SI-NEXT: v_cmp_gt_u32_e32 vcc, s34, v2 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB112_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8094,22 +9731,52 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB112_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB112_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 1, v1 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v0, v1, s[4:5] offset:16 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB112_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -8129,11 +9796,29 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB113_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v4 +; SI-NEXT: v_cmp_gt_u32_e32 vcc, s34, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB113_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8147,22 +9832,52 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB113_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB113_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 1, v3 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v0, v0, v1, s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB113_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -8181,11 +9896,29 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB114_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v4 +; SI-NEXT: v_cmp_gt_u32_e32 vcc, s34, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB114_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8201,22 +9934,52 @@ ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB114_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_mov_b32_e32 v2, s34 +; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB114_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 1, v3 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB114_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -8231,31 +9994,87 @@ ; SI-LABEL: global_atomic_udec_wrap_i32_noret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB115_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB115_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: .LBB115_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: s_cbranch_execnz .LBB115_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v[0:1], v2, off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: .LBB115_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GFX9-NEXT: v_add_u32_e32 v3, -1, v4 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB115_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -8265,14 +10084,34 @@ ; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB116_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[8:11], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB116_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8281,18 +10120,53 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: .LBB116_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: s_cbranch_execnz .LBB116_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: .LBB116_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GFX9-NEXT: v_add_u32_e32 v3, -1, v4 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB116_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -8303,32 +10177,90 @@ ; SI-LABEL: global_atomic_udec_wrap_i32_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB117_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v5 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v5, v2 +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB117_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: .LBB117_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: s_cbranch_execnz .LBB117_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: .LBB117_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GFX9-NEXT: v_add_u32_e32 v3, -1, v4 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB117_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -8338,35 +10270,91 @@ ; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB118_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v5 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v5, v2 +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[8:11], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB118_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: .LBB118_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2 +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: s_cbranch_execnz .LBB118_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GFX9-NEXT: v_add_u32_e32 v3, -1, v4 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB118_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -8386,11 +10374,31 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: .LBB119_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, -1, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s34 +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB119_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8404,22 +10412,58 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB119_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB119_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GFX9-NEXT: v_add_u32_e32 v0, -1, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v0, v1, s[4:5] +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX9-NEXT: s_cbranch_execnz .LBB119_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -8438,11 +10482,31 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: .LBB120_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, -1, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s34 +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v3, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB120_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8454,26 +10518,62 @@ ; VI-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_add_u32 s34, s4, 16 -; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_add_u32 s36, s4, 16 +; VI-NEXT: s_addc_u32 s37, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s36 +; VI-NEXT: v_mov_b32_e32 v1, s37 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: s_mov_b64 s[38:39], 0 +; VI-NEXT: .LBB120_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v2, s36 +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, s37 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; VI-NEXT: s_cbranch_execnz .LBB120_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[38:39] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GFX9-NEXT: v_add_u32_e32 v0, -1, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v0, v1, s[4:5] offset:16 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX9-NEXT: s_cbranch_execnz .LBB120_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -8493,11 +10593,32 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: .LBB121_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB121_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8511,22 +10632,58 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB121_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, -1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB121_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_add_u32_e32 v2, -1, v3 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v0, v0, v1, s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX9-NEXT: s_cbranch_execnz .LBB121_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -8545,11 +10702,32 @@ ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: .LBB122_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc +; SI-NEXT: v_mov_b32_e32 v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB122_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] +; SI-NEXT: v_mov_b32_e32 v0, v2 ; SI-NEXT: v_readlane_b32 s7, v1, 1 ; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8561,26 +10739,62 @@ ; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_add_u32 s34, s4, 16 -; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: s_add_u32 s36, s4, 16 +; VI-NEXT: s_addc_u32 s37, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s36 +; VI-NEXT: v_mov_b32_e32 v1, s37 +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b64 s[38:39], 0 +; VI-NEXT: .LBB122_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, -1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v2, s36 +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, s37 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; VI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; VI-NEXT: s_cbranch_execnz .LBB122_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[38:39] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_add_u32_e32 v2, -1, v3 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX9-NEXT: s_cbranch_execnz .LBB122_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst Index: llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -1713,27 +1713,79 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB24_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; SI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB24_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB24_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; VI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB24_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB24_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -1747,10 +1799,30 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB25_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; SI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB25_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1759,18 +1831,49 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB25_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; VI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB25_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB25_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst @@ -1781,33 +1884,91 @@ ; SI-LABEL: global_atomic_sub_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB26_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_sub_i32_e32 v8, vcc, v10, v5 +; SI-NEXT: v_subb_u32_e32 v9, vcc, v11, v4, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB26_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB26_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; VI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB26_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB26_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -1817,36 +1978,91 @@ ; SI-LABEL: global_atomic_sub_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB27_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_sub_i32_e32 v8, vcc, v10, v5 +; SI-NEXT: v_subb_u32_e32 v9, vcc, v11, v4, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB27_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB27_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_sub_u32_e32 v6, vcc, v8, v2 +; VI-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB27_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB27_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst @@ -1863,16 +2079,35 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB28_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v2, s35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_subrev_i32_e32 v1, vcc, s34, v3 +; SI-NEXT: v_subb_u32_e32 v2, vcc, v4, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB28_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1884,26 +2119,56 @@ ; VI-LABEL: global_atomic_sub_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB28_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB28_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB28_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -1919,14 +2184,35 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[4:7], 0 offset:32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB29_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v2, s35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_subrev_i32_e32 v1, vcc, s34, v3 +; SI-NEXT: v_subb_u32_e32 v2, vcc, v4, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB29_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1940,26 +2226,56 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB29_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB29_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB29_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst @@ -1976,16 +2292,37 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB30_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 ; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_subrev_i32_e32 v5, vcc, s34, v7 +; SI-NEXT: v_subb_u32_e32 v6, vcc, v8, v0, vcc +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB30_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -1997,26 +2334,56 @@ ; VI-LABEL: global_atomic_sub_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB30_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB30_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s6, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v6, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB30_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -2032,14 +2399,37 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB31_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_subrev_i32_e32 v5, vcc, s34, v7 +; SI-NEXT: v_subb_u32_e32 v6, vcc, v8, v0, vcc +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB31_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2053,26 +2443,56 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB31_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB31_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s6, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v6, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB31_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst @@ -2091,27 +2511,79 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB32_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, v7, v3 +; SI-NEXT: v_and_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB32_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB32_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v5, v7, v3 +; VI-NEXT: v_and_b32_e32 v4, v6, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB32_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB32_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -2125,10 +2597,30 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB33_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, v7, v3 +; SI-NEXT: v_and_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB33_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2137,18 +2629,49 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB33_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v5, v7, v3 +; VI-NEXT: v_and_b32_e32 v4, v6, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB33_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB33_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst @@ -2159,33 +2682,91 @@ ; SI-LABEL: global_atomic_and_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_mov_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB34_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_and_b32_e32 v9, v11, v4 +; SI-NEXT: v_and_b32_e32 v8, v10, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB34_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB34_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_and_b32_e32 v5, v7, v3 +; VI-NEXT: v_and_b32_e32 v4, v6, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB34_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB34_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -2195,36 +2776,91 @@ ; SI-LABEL: global_atomic_and_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB35_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_and_b32_e32 v9, v11, v4 +; SI-NEXT: v_and_b32_e32 v8, v10, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB35_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB35_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_and_b32_e32 v7, v9, v3 +; VI-NEXT: v_and_b32_e32 v6, v8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB35_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB35_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst @@ -2241,16 +2877,34 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB36_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, s35, v4 +; SI-NEXT: v_and_b32_e32 v1, s34, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB36_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2262,26 +2916,54 @@ ; VI-LABEL: global_atomic_and_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB36_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v1, s7, v3 +; VI-NEXT: v_and_b32_e32 v0, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB36_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v0, s6, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB36_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -2297,14 +2979,34 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB37_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, s35, v4 +; SI-NEXT: v_and_b32_e32 v1, s34, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB37_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2318,26 +3020,54 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB37_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v1, s7, v3 +; VI-NEXT: v_and_b32_e32 v0, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB37_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v0, s6, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB37_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst @@ -2354,16 +3084,36 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB38_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, s35, v8 +; SI-NEXT: v_and_b32_e32 v5, s34, v7 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB38_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2375,26 +3125,54 @@ ; VI-LABEL: global_atomic_and_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB38_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_and_b32_e32 v1, s7, v3 +; VI-NEXT: v_and_b32_e32 v0, s6, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB38_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_and_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_and_b32_e32 v3, s6, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB38_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -2410,14 +3188,36 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB39_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, s35, v8 +; SI-NEXT: v_and_b32_e32 v5, s34, v7 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB39_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -2431,26 +3231,54 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB39_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_and_b32_e32 v1, s7, v3 +; VI-NEXT: v_and_b32_e32 v0, s6, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB39_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_and_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_and_b32_e32 v3, s6, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB39_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst @@ -3303,27 +4131,79 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB48_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v7, v3 +; SI-NEXT: v_or_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB48_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_or_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB48_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v5, v7, v3 +; VI-NEXT: v_or_b32_e32 v4, v6, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB48_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB48_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -3337,10 +4217,30 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB49_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v7, v3 +; SI-NEXT: v_or_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB49_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3349,18 +4249,49 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB49_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v5, v7, v3 +; VI-NEXT: v_or_b32_e32 v4, v6, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB49_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB49_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst @@ -3371,33 +4302,91 @@ ; SI-LABEL: global_atomic_or_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB50_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_or_b32_e32 v9, v11, v4 +; SI-NEXT: v_or_b32_e32 v8, v10, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB50_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_or_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB50_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_or_b32_e32 v5, v7, v3 +; VI-NEXT: v_or_b32_e32 v4, v6, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB50_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB50_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -3407,36 +4396,91 @@ ; SI-LABEL: global_atomic_or_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB51_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_or_b32_e32 v9, v11, v4 +; SI-NEXT: v_or_b32_e32 v8, v10, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB51_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_or_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB51_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_or_b32_e32 v7, v9, v3 +; VI-NEXT: v_or_b32_e32 v6, v8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB51_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB51_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst @@ -3453,16 +4497,34 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB52_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, s35, v4 +; SI-NEXT: v_or_b32_e32 v1, s34, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB52_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3474,26 +4536,54 @@ ; VI-LABEL: global_atomic_or_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB52_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v1, s7, v3 +; VI-NEXT: v_or_b32_e32 v0, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB52_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_or_b32_e32 v0, s6, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB52_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -3509,14 +4599,34 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB53_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, s35, v4 +; SI-NEXT: v_or_b32_e32 v1, s34, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB53_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3530,26 +4640,54 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB53_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v1, s7, v3 +; VI-NEXT: v_or_b32_e32 v0, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB53_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_or_b32_e32 v0, s6, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB53_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst @@ -3566,16 +4704,36 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB54_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v6, s35, v8 +; SI-NEXT: v_or_b32_e32 v5, s34, v7 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB54_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3587,26 +4745,54 @@ ; VI-LABEL: global_atomic_or_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB54_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_or_b32_e32 v1, s7, v3 +; VI-NEXT: v_or_b32_e32 v0, s6, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB54_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_or_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_or_b32_e32 v3, s6, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB54_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -3622,14 +4808,36 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB55_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v6, s35, v8 +; SI-NEXT: v_or_b32_e32 v5, s34, v7 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB55_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3643,26 +4851,54 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB55_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_or_b32_e32 v1, s7, v3 +; VI-NEXT: v_or_b32_e32 v0, s6, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB55_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_or_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_or_b32_e32 v3, s6, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB55_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst @@ -3681,27 +4917,79 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB56_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v5, v7, v3 +; SI-NEXT: v_xor_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB56_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB56_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v5, v7, v3 +; VI-NEXT: v_xor_b32_e32 v4, v6, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB56_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB56_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -3715,10 +5003,30 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB57_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v5, v7, v3 +; SI-NEXT: v_xor_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB57_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3727,18 +5035,49 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB57_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v5, v7, v3 +; VI-NEXT: v_xor_b32_e32 v4, v6, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB57_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB57_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst @@ -3749,33 +5088,91 @@ ; SI-LABEL: global_atomic_xor_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB58_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_xor_b32_e32 v9, v11, v4 +; SI-NEXT: v_xor_b32_e32 v8, v10, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB58_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB58_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_xor_b32_e32 v5, v7, v3 +; VI-NEXT: v_xor_b32_e32 v4, v6, v2 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB58_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB58_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -3785,36 +5182,91 @@ ; SI-LABEL: global_atomic_xor_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v5, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB59_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_xor_b32_e32 v9, v11, v4 +; SI-NEXT: v_xor_b32_e32 v8, v10, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB59_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB59_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_xor_b32_e32 v7, v9, v3 +; VI-NEXT: v_xor_b32_e32 v6, v8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB59_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB59_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst @@ -3831,16 +5283,34 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB60_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v2, s35, v4 +; SI-NEXT: v_xor_b32_e32 v1, s34, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB60_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3852,26 +5322,54 @@ ; VI-LABEL: global_atomic_xor_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB60_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v1, s7, v3 +; VI-NEXT: v_xor_b32_e32 v0, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB60_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB60_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -3887,14 +5385,34 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB61_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v2, s35, v4 +; SI-NEXT: v_xor_b32_e32 v1, s34, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB61_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3908,26 +5426,54 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB61_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v1, s7, v3 +; VI-NEXT: v_xor_b32_e32 v0, s6, v2 +; VI-NEXT: v_mov_b32_e32 v5, s35 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB61_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB61_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst @@ -3944,16 +5490,36 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB62_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_xor_b32_e32 v6, s35, v8 +; SI-NEXT: v_xor_b32_e32 v5, s34, v7 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB62_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -3965,26 +5531,54 @@ ; VI-LABEL: global_atomic_xor_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB62_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_xor_b32_e32 v1, s7, v3 +; VI-NEXT: v_xor_b32_e32 v0, s6, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB62_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_xor_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_xor_b32_e32 v3, s6, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB62_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -4000,14 +5594,36 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB63_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_xor_b32_e32 v6, s35, v8 +; SI-NEXT: v_xor_b32_e32 v5, s34, v7 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB63_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -4021,26 +5637,54 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB63_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_xor_b32_e32 v1, s7, v3 +; VI-NEXT: v_xor_b32_e32 v0, s6, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB63_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_xor_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_xor_b32_e32 v3, s6, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB63_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst @@ -8733,27 +10377,88 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB107_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB107_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB107_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB107_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB107_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -8767,10 +10472,33 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB108_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB108_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8779,18 +10507,55 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB108_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB108_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB108_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -8801,33 +10566,100 @@ ; SI-LABEL: global_atomic_uinc_wrap_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB109_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v10 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v11, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB109_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB109_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB109_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB109_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB109_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -8837,36 +10669,100 @@ ; SI-LABEL: global_atomic_uinc_wrap_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB110_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v10 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v11, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB110_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB110_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v8 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB110_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB110_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB110_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -8883,16 +10779,37 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB111_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v3 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[3:4] +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB111_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8904,26 +10821,60 @@ ; VI-LABEL: global_atomic_uinc_wrap_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB111_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB111_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB111_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -8939,14 +10890,37 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB112_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v3 +; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[3:4] +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB112_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -8960,26 +10934,60 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB112_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB112_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB112_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -8996,16 +11004,39 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB113_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v7 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[7:8] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB113_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -9017,26 +11048,60 @@ ; VI-LABEL: global_atomic_uinc_wrap_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB113_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB113_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB113_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -9052,14 +11117,39 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB114_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v7 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[7:8] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e32 v6, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB114_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -9073,26 +11163,60 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: v_mov_b32_e32 v0, s34 +; VI-NEXT: v_mov_b32_e32 v1, s35 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB114_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB114_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB114_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -9107,31 +11231,98 @@ ; SI-LABEL: global_atomic_udec_wrap_i64_noret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB115_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, -1, v6 +; SI-NEXT: v_addc_u32_e32 v5, vcc, -1, v7, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB115_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[8:9], 0 +; VI-NEXT: .LBB115_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; VI-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB115_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB115_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB115_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -9141,14 +11332,39 @@ ; SI-LABEL: global_atomic_udec_wrap_i64_noret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[8:11], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB116_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, -1, v6 +; SI-NEXT: v_addc_u32_e32 v5, vcc, -1, v7, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB116_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -9157,18 +11373,59 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[8:9], 0 +; VI-NEXT: .LBB116_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; VI-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB116_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB116_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB116_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -9179,33 +11436,106 @@ ; SI-LABEL: global_atomic_udec_wrap_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB117_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v10 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v11, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[4:5] +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, v1, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB117_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[8:9], 0 +; VI-NEXT: .LBB117_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; VI-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB117_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[8:9] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB117_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB117_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -9215,36 +11545,106 @@ ; SI-LABEL: global_atomic_udec_wrap_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[8:11], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB118_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v10 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v11, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[4:5] +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, v1, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[8:11], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB118_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[8:9], 0 +; VI-NEXT: .LBB118_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; VI-NEXT: v_add_u32_e64 v0, s[6:7], -1, v8 +; VI-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB118_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB118_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -9261,16 +11661,41 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: .LBB119_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, -1, v3 +; SI-NEXT: v_addc_u32_e32 v2, vcc, -1, v4, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[3:4] +; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[3:4] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, s35 +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB119_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -9282,26 +11707,68 @@ ; VI-LABEL: global_atomic_udec_wrap_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: .LBB119_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB119_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX9-NEXT: s_cbranch_execnz .LBB119_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -9317,14 +11784,41 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v0, s6, 0 ; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: .LBB120_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, -1, v3 +; SI-NEXT: v_addc_u32_e32 v2, vcc, -1, v4, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[3:4] +; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[3:4] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, s35 +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB120_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] ; SI-NEXT: v_readlane_b32 s7, v0, 1 ; SI-NEXT: v_readlane_b32 s6, v0, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -9336,28 +11830,70 @@ ; VI-LABEL: global_atomic_udec_wrap_i64_noret_offset_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_add_u32 s34, s4, 32 -; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: s_add_u32 s36, s4, 32 +; VI-NEXT: s_addc_u32 s37, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s36 +; VI-NEXT: v_mov_b32_e32 v1, s37 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_mov_b64 s[38:39], 0 +; VI-NEXT: .LBB120_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v4, s36 +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v5, s37 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; VI-NEXT: s_cbranch_execnz .LBB120_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[38:39] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX9-NEXT: s_cbranch_execnz .LBB120_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -9374,16 +11910,43 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: .LBB121_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v7 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v8, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[7:8] +; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[7:8] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s35 +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v6, v1, v3, vcc ; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB121_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -9395,26 +11958,68 @@ ; VI-LABEL: global_atomic_udec_wrap_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[38:39], 0 +; VI-NEXT: .LBB121_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v7, s[36:37], -1, v2 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; VI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; VI-NEXT: s_cbranch_execnz .LBB121_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[38:39] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[5:6] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[5:6] +; GFX9-NEXT: v_add_co_u32_e64 v3, s[36:37], -1, v5 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[36:37], -1, v6, s[36:37] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB121_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -9430,14 +12035,43 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v2, s6, 0 ; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: .LBB122_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v7 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v8, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[7:8] +; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[7:8] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s35 +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v6, v1, v3, vcc +; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: v_mov_b32_e32 v4, v6 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB122_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: v_readlane_b32 s7, v2, 1 ; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -9449,28 +12083,70 @@ ; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_add_u32 s34, s4, 32 -; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 +; VI-NEXT: s_add_u32 s38, s4, 32 +; VI-NEXT: s_addc_u32 s39, s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s38 +; VI-NEXT: v_mov_b32_e32 v1, s39 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_mov_b64 s[40:41], 0 +; VI-NEXT: .LBB122_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v7, s[36:37], -1, v2 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v6, s6 +; VI-NEXT: v_mov_b32_e32 v4, s38 +; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v5, s39 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; VI-NEXT: s_andn2_b64 exec, exec, s[40:41] +; VI-NEXT: s_cbranch_execnz .LBB122_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[40:41] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[5:6] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[5:6] +; GFX9-NEXT: v_add_co_u32_e64 v3, s[36:37], -1, v5 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[36:37], -1, v6, s[36:37] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB122_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst Index: llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll =================================================================== --- llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll +++ llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll @@ -153,8 +153,17 @@ ; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] ; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[INV_MASK]], [[VALOPERAND_SHIFTED]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[ANDOPERAND]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; CHECK-NEXT: ret i16 [[EXTRACTED]] ; @@ -206,8 +215,17 @@ ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw or ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; CHECK-NEXT: ret i16 [[EXTRACTED]] ; @@ -226,8 +244,17 @@ ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw xor ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = xor i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; CHECK-NEXT: ret i16 [[EXTRACTED]] ; Index: llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll =================================================================== --- llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll +++ llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll @@ -162,8 +162,17 @@ ; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] ; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[INV_MASK]], [[VALOPERAND_SHIFTED]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[ANDOPERAND]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED]] ; @@ -215,8 +224,17 @@ ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw or ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED]] ; @@ -235,8 +253,17 @@ ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw xor ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = xor i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED]] ;