diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h --- a/llvm/include/llvm/CodeGen/MachineMemOperand.h +++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h @@ -282,17 +282,7 @@ /// success and failure orderings for an atomic operation. (For operations /// other than cmpxchg, this is equivalent to getSuccessOrdering().) AtomicOrdering getMergedOrdering() const { - AtomicOrdering Ordering = getSuccessOrdering(); - AtomicOrdering FailureOrdering = getFailureOrdering(); - if (FailureOrdering == AtomicOrdering::SequentiallyConsistent) - return AtomicOrdering::SequentiallyConsistent; - if (FailureOrdering == AtomicOrdering::Acquire) { - if (Ordering == AtomicOrdering::Monotonic) - return AtomicOrdering::Acquire; - if (Ordering == AtomicOrdering::Release) - return AtomicOrdering::AcquireRelease; - } - return Ordering; + return getMergedAtomicOrdering(getSuccessOrdering(), getFailureOrdering()); } bool isLoad() const { return FlagVals & MOLoad; } diff --git a/llvm/include/llvm/Support/AtomicOrdering.h b/llvm/include/llvm/Support/AtomicOrdering.h --- a/llvm/include/llvm/Support/AtomicOrdering.h +++ b/llvm/include/llvm/Support/AtomicOrdering.h @@ -133,6 +133,16 @@ return isAtLeastOrStrongerThan(AO, AtomicOrdering::Release); } +/// Return a single atomic ordering that is at least as strong as both the \p AO +/// and \p Other orderings for an atomic operation. +inline AtomicOrdering getMergedAtomicOrdering(AtomicOrdering AO, + AtomicOrdering Other) { + if ((AO == AtomicOrdering::Acquire && Other == AtomicOrdering::Release) || + (AO == AtomicOrdering::Release && Other == AtomicOrdering::Acquire)) + return AtomicOrdering::AcquireRelease; + return isStrongerThan(AO, Other) ? AO : Other; +} + inline AtomicOrderingCABI toCABI(AtomicOrdering AO) { static const AtomicOrderingCABI lookup[8] = { /* NotAtomic */ AtomicOrderingCABI::relaxed, diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -126,8 +126,7 @@ (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != SIAtomicAddrSpace::NONE && (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) != - SIAtomicAddrSpace::NONE && - !isStrongerThan(FailureOrdering, Ordering)); + SIAtomicAddrSpace::NONE); // There is also no cross address space ordering if the ordering // address space is the same as the instruction address space and @@ -651,14 +650,11 @@ } SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID(); - Ordering = isStrongerThan(Ordering, OpOrdering) - ? Ordering - : MMO->getSuccessOrdering(); + Ordering = getMergedAtomicOrdering(Ordering, OpOrdering); assert(MMO->getFailureOrdering() != AtomicOrdering::Release && MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease); FailureOrdering = - isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ? - FailureOrdering : MMO->getFailureOrdering(); + getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering()); } } diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -1887,6 +1887,106 @@ ret void } +define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( +; GFX7-LABEL: flat_agent_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") monotonic acquire + ret void +} + define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry @@ -2311,8 +2411,8 @@ ret void } -define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( -; GFX7-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( +; GFX7-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2329,7 +2429,7 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2350,7 +2450,7 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2371,7 +2471,7 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2387,7 +2487,7 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2400,7 +2500,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2415,211 +2515,195 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") monotonic seq_cst ret void } -define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( +; GFX7-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire seq_cst ret void } -define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( +; GFX7-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2627,13 +2711,12 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2641,107 +2724,94 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release seq_cst ret void } -define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( +; GFX7-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2749,13 +2819,12 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2763,136 +2832,127 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel seq_cst ret void } -define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst ret void } -define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_agent_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2903,16 +2963,14 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2924,18 +2982,14 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2947,18 +3001,14 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2969,52 +3019,47 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3025,7 +3070,6 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -3034,7 +3078,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -3046,8 +3090,6 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -3057,7 +3099,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -3069,8 +3111,6 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv @@ -3080,7 +3120,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3091,7 +3131,6 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3099,28 +3138,26 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -3129,14 +3166,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3149,14 +3186,13 @@ ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -3171,15 +3207,13 @@ ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -3194,15 +3228,13 @@ ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3215,13 +3247,13 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3231,11 +3263,10 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3245,20 +3276,19 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( -; GFX7-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3278,7 +3308,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -3301,7 +3331,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -3324,7 +3354,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3343,7 +3373,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3357,7 +3387,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3373,1417 +3403,3851 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_agent_one_as_unordered_load( -; GFX7-LABEL: flat_agent_one_as_unordered_load: +define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_unordered_load: +; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_unordered_load: +; GFX10-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_unordered_load: +; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_unordered_load( +; GFX7-LABEL: flat_agent_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("agent-one-as") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_monotonic_load( +; GFX7-LABEL: flat_agent_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("agent-one-as") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_acquire_load( +; GFX7-LABEL: flat_agent_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("agent-one-as") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( +; GFX7-LABEL: flat_agent_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("agent-one-as") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_unordered_store( +; GFX7-LABEL: flat_agent_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("agent-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_monotonic_store( +; GFX7-LABEL: flat_agent_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("agent-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_release_store( +; GFX7-LABEL: flat_agent_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("agent-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( +; GFX7-LABEL: flat_agent_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("agent-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( +; GFX7-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") monotonic + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( +; GFX7-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( +; GFX7-LABEL: flat_agent_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") release + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( +; GFX7-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_load: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %in, i32* %out) { + i32* %out, i32 %in) { entry: - %val = load atomic i32, i32* %in syncscope("agent-one-as") unordered, align 4 - store i32 %val, i32* %out + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel ret void } -define amdgpu_kernel void @flat_agent_one_as_monotonic_load( -; GFX7-LABEL: flat_agent_one_as_monotonic_load: +define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( +; GFX7-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_load: +; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_monotonic_load: +; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_load: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %in, i32* %out) { + i32* %out, i32 %in) { entry: - %val = load atomic i32, i32* %in syncscope("agent-one-as") monotonic, align 4 - store i32 %val, i32* %out + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst ret void } -define amdgpu_kernel void @flat_agent_one_as_acquire_load( -; GFX7-LABEL: flat_agent_one_as_acquire_load: +define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( +; GFX7-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_acquire_load: +; GFX10-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_acquire_load: +; GFX10-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_load: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_load: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %in, i32* %out) { + i32* %out, i32 %in) { entry: - %val = load atomic i32, i32* %in syncscope("agent-one-as") acquire, align 4 - store i32 %val, i32* %out + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire + store i32 %val, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( -; GFX7-LABEL: flat_agent_one_as_seq_cst_load: +define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( +; GFX7-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_load: +; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_load: +; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_load: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %in, i32* %out) { + i32* %out, i32 %in) { entry: - %val = load atomic i32, i32* %in syncscope("agent-one-as") seq_cst, align 4 - store i32 %val, i32* %out + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel + store i32 %val, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_agent_one_as_unordered_store( -; GFX7-LABEL: flat_agent_one_as_unordered_store: +define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( +; GFX7-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_unordered_store: +; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_unordered_store: +; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_unordered_store: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_store: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32* %out) { + i32* %out, i32 %in) { entry: - store atomic i32 %in, i32* %out syncscope("agent-one-as") unordered, align 4 + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst + store i32 %val, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_agent_one_as_monotonic_store( -; GFX7-LABEL: flat_agent_one_as_monotonic_store: +define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_store: +; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_monotonic_store: +; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_store: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32* %out) { + i32* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32* %out syncscope("agent-one-as") monotonic, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic ret void } -define amdgpu_kernel void @flat_agent_one_as_release_store( -; GFX7-LABEL: flat_agent_one_as_release_store: +define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_release_store: +; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_release_store: +; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_store: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_store: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32* %out) { + i32* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32* %out syncscope("agent-one-as") release, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic ret void } -define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( -; GFX7-LABEL: flat_agent_one_as_seq_cst_store: +define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_store: +; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_store: +; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_store: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32* %out) { + i32* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32* %out syncscope("agent-one-as") seq_cst, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic ret void } -define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( -; GFX7-LABEL: flat_agent_one_as_monotonic_atomicrmw: +define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic ret void } -define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( -; GFX7-LABEL: flat_agent_one_as_acquire_atomicrmw: +define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic ret void } -define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( -; GFX7-LABEL: flat_agent_one_as_release_atomicrmw: +define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_release_atomicrmw: +; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_release_atomicrmw: +; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") release + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic acquire ret void } -define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( -; GFX7-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire ret void } -define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( -; GFX7-LABEL: flat_agent_one_as_seq_cst_atomicrmw: +define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw: +; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw: +; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire ret void } -define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( -; GFX7-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: +define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: +; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: +; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire - store i32 %val, i32* %out, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire ret void } -define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( -; GFX7-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: +define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel - store i32 %val, i32* %out, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire ret void } -define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( -; GFX7-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: +define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst - store i32 %val, i32* %out, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic seq_cst ret void } -define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( -; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: +define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4794,10 +7258,13 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4809,10 +7276,15 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4824,10 +7296,15 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4838,37 +7315,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire seq_cst ret void } -define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( -; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: +define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4879,12 +7364,13 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4896,13 +7382,15 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4914,13 +7402,15 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4931,29 +7421,32 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -4961,12 +7454,12 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release seq_cst ret void } -define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( -; GFX7-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: +define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4979,9 +7472,11 @@ ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4996,9 +7491,12 @@ ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5013,9 +7511,12 @@ ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5028,9 +7529,10 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5039,9 +7541,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5050,16 +7554,18 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel seq_cst ret void } -define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( -; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: +define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5076,7 +7582,7 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5096,7 +7602,7 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5116,7 +7622,7 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5132,7 +7638,7 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5145,7 +7651,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5160,289 +7666,442 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst ret void } -define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( -; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: +define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( -; GFX7-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: +define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: +; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( -; GFX7-LABEL: flat_agent_one_as_release_acquire_cmpxchg: +define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: +; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5450,12 +8109,14 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5463,92 +8124,111 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( -; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: +define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5556,12 +8236,14 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5569,231 +8251,260 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( -; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: +define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( -; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5804,6 +8515,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -5813,7 +8525,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5825,6 +8537,8 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -5835,7 +8549,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5847,6 +8561,8 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv @@ -5857,7 +8573,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5868,6 +8584,7 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -5876,13 +8593,14 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -5890,13 +8608,14 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -5905,14 +8624,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5933,7 +8652,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5957,7 +8676,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5981,7 +8700,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6001,7 +8720,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6016,7 +8735,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6032,14 +8751,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6060,7 +8779,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6084,7 +8803,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6108,7 +8827,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6128,7 +8847,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6143,7 +8862,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6159,14 +8878,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6177,6 +8896,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -6186,7 +8906,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6198,6 +8918,8 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -6208,7 +8930,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6220,6 +8942,8 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv @@ -6230,7 +8954,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6241,6 +8965,7 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -6249,13 +8974,14 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -6263,13 +8989,14 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -6278,14 +9005,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6306,7 +9033,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6330,7 +9057,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6354,7 +9081,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6374,7 +9101,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6389,7 +9116,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6405,14 +9132,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6433,7 +9160,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6457,7 +9184,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6481,7 +9208,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6501,7 +9228,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6516,7 +9243,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6532,14 +9259,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6560,7 +9287,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6584,7 +9311,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6608,7 +9335,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6628,7 +9355,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6643,7 +9370,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6659,7 +9386,7 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -84,8 +84,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4 @@ -171,8 +169,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4 @@ -258,8 +254,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4 @@ -345,8 +339,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4 @@ -420,8 +412,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("singlethread") unordered, align 4 @@ -494,8 +484,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4 @@ -568,8 +556,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("singlethread") release, align 4 @@ -642,8 +628,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4 @@ -716,8 +700,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") monotonic @@ -790,8 +772,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire @@ -864,8 +844,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") release @@ -938,8 +916,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel @@ -1012,8 +988,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst @@ -1098,8 +1072,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire @@ -1185,8 +1157,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel @@ -1272,8 +1242,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst @@ -1359,8 +1327,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1446,8 +1412,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1533,8 +1497,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1620,8 +1582,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1707,12 +1667,95 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( +; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_endpgm ; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire ret void } @@ -1794,8 +1837,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1881,8 +1922,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1968,8 +2007,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2055,8 +2092,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2064,8 +2099,8 @@ ret void } -define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( -; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( +; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2079,7 +2114,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2094,7 +2129,7 @@ ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2109,7 +2144,7 @@ ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2123,7 +2158,7 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2133,7 +2168,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2142,453 +2177,355 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst ret void } -define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( +; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst ret void } -define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( +; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 -; GFX10-CU-NEXT: s_endpgm +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst ret void } -define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( +; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst ret void } -define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst ret void } -define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2606,7 +2543,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2625,7 +2562,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2644,7 +2581,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2662,7 +2599,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2674,7 +2611,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2685,19 +2622,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2715,7 +2650,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2734,7 +2669,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2753,7 +2688,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2771,7 +2706,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2783,7 +2718,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2794,19 +2729,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2824,7 +2757,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2843,7 +2776,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2862,7 +2795,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2880,7 +2813,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2892,7 +2825,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2903,19 +2836,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( -; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2933,7 +2864,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2952,7 +2883,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2971,7 +2902,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2989,7 +2920,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3001,7 +2932,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3012,1294 +2943,3372 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( -; GFX7-LABEL: flat_singlethread_one_as_unordered_load: +define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load: +; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load: +; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_load: +; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %in, i32* %out) { + i32* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32* %in syncscope("singlethread-one-as") unordered, align 4 - store i32 %val, i32* %out + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( -; GFX7-LABEL: flat_singlethread_one_as_monotonic_load: +define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load: +; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load: +; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_load: +; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( +; GFX7-LABEL: flat_singlethread_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("singlethread-one-as") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( +; GFX7-LABEL: flat_singlethread_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("singlethread-one-as") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( +; GFX7-LABEL: flat_singlethread_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("singlethread-one-as") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( +; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("singlethread-one-as") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( +; GFX7-LABEL: flat_singlethread_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("singlethread-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( +; GFX7-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("singlethread-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_release_store( +; GFX7-LABEL: flat_singlethread_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("singlethread-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( +; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("singlethread-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( +; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") monotonic + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( +; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( +; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") release + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( +; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( +; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %in, i32* %out) { + i32* %out, i32 %in) { entry: - %val = load atomic i32, i32* %in syncscope("singlethread-one-as") monotonic, align 4 - store i32 %val, i32* %out + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst ret void } -define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( -; GFX7-LABEL: flat_singlethread_one_as_acquire_load: +define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( +; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load: +; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load: +; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_load: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %in, i32* %out) { + i32* %out, i32 %in) { entry: - %val = load atomic i32, i32* %in syncscope("singlethread-one-as") acquire, align 4 - store i32 %val, i32* %out + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire + store i32 %val, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( -; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load: +define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( +; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load: +; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load: +; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_load: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %in, i32* %out) { + i32* %out, i32 %in) { entry: - %val = load atomic i32, i32* %in syncscope("singlethread-one-as") seq_cst, align 4 - store i32 %val, i32* %out + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel + store i32 %val, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( -; GFX7-LABEL: flat_singlethread_one_as_unordered_store: +define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( +; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store: +; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store: +; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_store: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 %in, i32* %out) { + i32* %out, i32 %in) { entry: - store atomic i32 %in, i32* %out syncscope("singlethread-one-as") unordered, align 4 + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst + store i32 %val, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( -; GFX7-LABEL: flat_singlethread_one_as_monotonic_store: +define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_store: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 %in, i32* %out) { + i32* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32* %out syncscope("singlethread-one-as") monotonic, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic ret void } -define amdgpu_kernel void @flat_singlethread_one_as_release_store( -; GFX7-LABEL: flat_singlethread_one_as_release_store: +define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store: +; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_release_store: +; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_store: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 %in, i32* %out) { + i32* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32* %out syncscope("singlethread-one-as") release, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic ret void } -define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( -; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store: +define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_store: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 %in, i32* %out) { + i32* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32* %out syncscope("singlethread-one-as") seq_cst, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic ret void } -define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( -; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic ret void } -define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( -; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic ret void } -define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( -; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw: +define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") release + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire ret void } -define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( -; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire ret void } -define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( -; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire ret void } -define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( -; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: +define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: +; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: +; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire - store i32 %val, i32* %out, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire ret void } -define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( -; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: +define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel - store i32 %val, i32* %out, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire ret void } -define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( -; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: +define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst - store i32 %val, i32* %out, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst ret void } -define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( -; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: +define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4313,7 +6322,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4328,7 +6337,7 @@ ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4343,7 +6352,7 @@ ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4357,7 +6366,7 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4367,7 +6376,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4376,17 +6385,15 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst ret void } -define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( -; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: +define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4400,7 +6407,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4415,7 +6422,7 @@ ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4430,7 +6437,7 @@ ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4444,7 +6451,7 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4454,7 +6461,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4463,17 +6470,15 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst ret void } -define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( -; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: +define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4487,7 +6492,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4502,7 +6507,7 @@ ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4517,7 +6522,7 @@ ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4531,7 +6536,7 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4541,7 +6546,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4550,17 +6555,15 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst ret void } -define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( -; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: +define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4574,7 +6577,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4589,7 +6592,7 @@ ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4604,7 +6607,7 @@ ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4618,7 +6621,7 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4628,7 +6631,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4637,539 +6640,764 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm ; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( -; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: +define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( -; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: +define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( -; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: +define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( -; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: +define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( -; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: +define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( -; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5187,7 +7415,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5206,7 +7434,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5225,7 +7453,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5243,7 +7471,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5255,7 +7483,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5266,19 +7494,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5296,7 +7522,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5315,7 +7541,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5334,7 +7560,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5352,7 +7578,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5364,7 +7590,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5375,19 +7601,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5405,7 +7629,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5424,7 +7648,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5443,7 +7667,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5461,7 +7685,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5473,7 +7697,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5484,19 +7708,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5514,7 +7736,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5533,7 +7755,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5552,7 +7774,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5570,7 +7792,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5582,7 +7804,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5593,19 +7815,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5623,7 +7843,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5642,7 +7862,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5661,7 +7881,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5679,7 +7899,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5691,7 +7911,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5702,19 +7922,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5732,7 +7950,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5751,7 +7969,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5770,7 +7988,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5788,7 +8006,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5800,7 +8018,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5811,19 +8029,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5841,7 +8057,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5860,7 +8076,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5879,7 +8095,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5897,7 +8113,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5909,7 +8125,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5920,12 +8136,10 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void @@ -6029,8 +8243,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -1929,6 +1929,108 @@ ret void } +define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( +; GFX7-LABEL: flat_system_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic acquire + ret void +} + define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry @@ -2367,8 +2469,8 @@ ret void } -define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( -; GFX7-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( +; GFX7-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2385,7 +2487,7 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2406,7 +2508,7 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2427,7 +2529,7 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2443,7 +2545,7 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2458,7 +2560,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2475,213 +2577,199 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic seq_cst ret void } -define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( +; GFX7-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire seq_cst ret void } -define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( +; GFX7-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2690,14 +2778,13 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2706,108 +2793,95 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release seq_cst ret void } -define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( +; GFX7-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2816,14 +2890,13 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2832,139 +2905,132 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel seq_cst ret void } -define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst ret void } -define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_system_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2975,16 +3041,14 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2996,18 +3060,14 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -3019,18 +3079,14 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3041,56 +3097,47 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3101,7 +3148,6 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -3110,7 +3156,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -3122,8 +3168,6 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -3133,7 +3177,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -3145,8 +3189,6 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv @@ -3156,7 +3198,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3167,7 +3209,6 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3175,15 +3216,13 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 @@ -3191,15 +3230,13 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 @@ -3209,14 +3246,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3229,14 +3266,13 @@ ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -3251,15 +3287,13 @@ ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -3274,15 +3308,13 @@ ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3295,13 +3327,13 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3312,12 +3344,10 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3328,21 +3358,19 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( -; GFX7-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3362,7 +3390,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -3385,7 +3413,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -3408,7 +3436,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3427,7 +3455,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3443,7 +3471,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3461,1447 +3489,4063 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_system_one_as_unordered_load( -; GFX7-LABEL: flat_system_one_as_unordered_load: +define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_unordered_load: +; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_unordered_load: +; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_unordered_load: +; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load: +; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %in, i32* %out) { + i32* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32* %in syncscope("one-as") unordered, align 4 - store i32 %val, i32* %out + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_system_one_as_monotonic_load( -; GFX7-LABEL: flat_system_one_as_monotonic_load: +define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load: +; GFX10-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_monotonic_load: +; GFX10-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_load: +; SKIP-CACHE-INV-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load: +; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %in, i32* %out) { + i32* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32* %in syncscope("one-as") monotonic, align 4 - store i32 %val, i32* %out - ret void + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_system_release_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_release_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_one_as_unordered_load( +; GFX7-LABEL: flat_system_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_one_as_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("one-as") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_system_one_as_monotonic_load( +; GFX7-LABEL: flat_system_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("one-as") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_system_one_as_acquire_load( +; GFX7-LABEL: flat_system_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("one-as") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_system_one_as_seq_cst_load( +; GFX7-LABEL: flat_system_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("one-as") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_system_one_as_unordered_store( +; GFX7-LABEL: flat_system_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_one_as_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_one_as_monotonic_store( +; GFX7-LABEL: flat_system_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_one_as_release_store( +; GFX7-LABEL: flat_system_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_one_as_seq_cst_store( +; GFX7-LABEL: flat_system_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( +; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( +; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire + ret void +} + +define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( +; GFX7-LABEL: flat_system_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") release + ret void +} + +define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( +; GFX7-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel + ret void +} + +define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( +; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst + ret void } -define amdgpu_kernel void @flat_system_one_as_acquire_load( -; GFX7-LABEL: flat_system_one_as_acquire_load: +define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( +; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_acquire_load: +; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_acquire_load: +; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_load: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %in, i32* %out) { + i32* %out, i32 %in) { entry: - %val = load atomic i32, i32* %in syncscope("one-as") acquire, align 4 - store i32 %val, i32* %out + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire + store i32 %val, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_system_one_as_seq_cst_load( -; GFX7-LABEL: flat_system_one_as_seq_cst_load: +define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( +; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load: +; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_seq_cst_load: +; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_load: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( +; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %in, i32* %out) { + i32* %out, i32 %in) { entry: - %val = load atomic i32, i32* %in syncscope("one-as") seq_cst, align 4 - store i32 %val, i32* %out + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst + store i32 %val, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_system_one_as_unordered_store( -; GFX7-LABEL: flat_system_one_as_unordered_store: +define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_unordered_store: +; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_unordered_store: +; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_unordered_store: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32* %out) { + i32* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32* %out syncscope("one-as") unordered, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic ret void } -define amdgpu_kernel void @flat_system_one_as_monotonic_store( -; GFX7-LABEL: flat_system_one_as_monotonic_store: +define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( +; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_monotonic_store: +; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_monotonic_store: +; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_store: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32* %out) { + i32* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32* %out syncscope("one-as") monotonic, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic ret void } -define amdgpu_kernel void @flat_system_one_as_release_store( -; GFX7-LABEL: flat_system_one_as_release_store: +define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( +; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_release_store: +; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_release_store: +; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_store: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32* %out) { + i32* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32* %out syncscope("one-as") release, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic ret void } -define amdgpu_kernel void @flat_system_one_as_seq_cst_store( -; GFX7-LABEL: flat_system_one_as_seq_cst_store: +define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_store: +; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_seq_cst_store: +; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_store: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32* %out) { + i32* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32* %out syncscope("one-as") seq_cst, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic ret void } -define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( -; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw: +define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic ret void } -define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( -; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw: +define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( +; GFX7-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire ret void } -define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( -; GFX7-LABEL: flat_system_one_as_release_atomicrmw: +define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( +; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw: +; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw: +; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") release + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire ret void } -define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( -; GFX7-LABEL: flat_system_one_as_acq_rel_atomicrmw: +define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( +; GFX7-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release acquire ret void } -define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( -; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw: +define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire ret void } -define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( -; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw: +define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw: +; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw: +; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire - store i32 %val, i32* %out, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire ret void } -define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( -; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: +define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( +; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel - store i32 %val, i32* %out, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst ret void } -define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( -; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( +; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst - store i32 %val, i32* %out, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst ret void } -define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( -; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: +define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( +; GFX7-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4912,10 +7556,13 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4927,10 +7574,15 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4942,10 +7594,15 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4956,37 +7613,49 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst ret void } -define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( -; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: +define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( +; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4997,12 +7666,13 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5014,13 +7684,15 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5032,13 +7704,15 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5049,30 +7723,35 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 @@ -5081,12 +7760,12 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst ret void } -define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( -; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg: +define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5099,9 +7778,11 @@ ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5116,9 +7797,12 @@ ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5133,9 +7817,12 @@ ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5148,9 +7835,10 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5160,9 +7848,12 @@ ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5172,409 +7863,453 @@ ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst ret void } -define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( -; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: +define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( -; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: +define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( -; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: +define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: +; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( -; GFX7-LABEL: flat_system_one_as_release_acquire_cmpxchg: +define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg: +; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: +; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5583,13 +8318,15 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5598,93 +8335,112 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( -; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: +define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5693,13 +8449,15 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5708,240 +8466,265 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( -; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: +define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( -; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5952,6 +8735,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -5961,7 +8745,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5973,6 +8757,8 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -5983,7 +8769,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5995,6 +8781,8 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv @@ -6005,7 +8793,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6016,6 +8804,7 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -6024,13 +8813,15 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 @@ -6039,13 +8830,15 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 @@ -6055,14 +8848,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6083,7 +8876,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6107,7 +8900,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6131,7 +8924,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6151,7 +8944,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6168,7 +8961,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6186,14 +8979,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6214,7 +9007,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6238,7 +9031,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6262,7 +9055,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6282,7 +9075,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6299,7 +9092,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6317,14 +9110,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6335,6 +9128,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -6344,7 +9138,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6356,6 +9150,8 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -6366,7 +9162,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6378,6 +9174,8 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv @@ -6388,7 +9186,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6399,6 +9197,7 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -6407,13 +9206,15 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 @@ -6422,13 +9223,15 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 @@ -6438,14 +9241,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6466,7 +9269,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6490,7 +9293,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6514,7 +9317,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6534,7 +9337,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6551,7 +9354,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6569,14 +9372,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6597,7 +9400,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6621,7 +9424,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6645,7 +9448,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6665,7 +9468,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6682,7 +9485,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6700,14 +9503,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6728,7 +9531,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6752,7 +9555,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6776,7 +9579,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6796,7 +9599,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6813,7 +9616,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6831,7 +9634,7 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -84,8 +84,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("wavefront") unordered, align 4 @@ -171,8 +169,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("wavefront") monotonic, align 4 @@ -258,8 +254,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("wavefront") acquire, align 4 @@ -345,8 +339,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("wavefront") seq_cst, align 4 @@ -420,8 +412,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("wavefront") unordered, align 4 @@ -494,8 +484,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("wavefront") monotonic, align 4 @@ -568,8 +556,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("wavefront") release, align 4 @@ -642,8 +628,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("wavefront") seq_cst, align 4 @@ -716,8 +700,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") monotonic @@ -790,8 +772,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire @@ -864,8 +844,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") release @@ -938,8 +916,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel @@ -1012,8 +988,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst @@ -1098,8 +1072,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire @@ -1185,8 +1157,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel @@ -1272,8 +1242,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst @@ -1359,8 +1327,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1446,8 +1412,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1533,8 +1497,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1620,8 +1582,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1707,12 +1667,95 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( +; GFX7-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_endpgm ; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire ret void } @@ -1794,8 +1837,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1881,8 +1922,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1968,8 +2007,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2055,8 +2092,6 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2064,8 +2099,8 @@ ret void } -define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( -; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( +; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2079,7 +2114,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2094,7 +2129,7 @@ ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2109,7 +2144,7 @@ ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2123,7 +2158,7 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2133,7 +2168,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2142,453 +2177,355 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst ret void } -define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( +; GFX7-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst ret void } -define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( +; GFX7-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 -; GFX10-CU-NEXT: s_endpgm +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst ret void } -define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( +; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst ret void } -define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst ret void } -define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2606,7 +2543,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2625,7 +2562,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2644,7 +2581,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2662,7 +2599,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2674,7 +2611,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2685,19 +2622,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2715,7 +2650,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2734,7 +2669,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2753,7 +2688,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2771,7 +2706,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2783,7 +2718,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2794,19 +2729,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2824,7 +2757,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2843,7 +2776,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2862,7 +2795,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2880,7 +2813,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2892,7 +2825,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2903,19 +2836,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( -; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2933,7 +2864,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2952,7 +2883,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2971,7 +2902,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2989,7 +2920,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3001,7 +2932,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3012,1294 +2943,3287 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( -; GFX7-LABEL: flat_wavefront_one_as_unordered_load: +define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load: +; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load: +; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_unordered_load: +; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm ; +; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm ; - i32* %in, i32* %out) { +; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32* %in syncscope("wavefront-one-as") unordered, align 4 - store i32 %val, i32* %out + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( -; GFX7-LABEL: flat_wavefront_one_as_monotonic_load: +define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( +; GFX7-LABEL: flat_wavefront_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("wavefront-one-as") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( +; GFX7-LABEL: flat_wavefront_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("wavefront-one-as") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( +; GFX7-LABEL: flat_wavefront_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("wavefront-one-as") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( +; GFX7-LABEL: flat_wavefront_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("wavefront-one-as") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( +; GFX7-LABEL: flat_wavefront_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("wavefront-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( +; GFX7-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("wavefront-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_release_store( +; GFX7-LABEL: flat_wavefront_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("wavefront-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( +; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("wavefront-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( +; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") monotonic + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( +; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( +; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") release + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( +; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load: +; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load: +; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_load: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %in, i32* %out) { + i32* %out, i32 %in) { entry: - %val = load atomic i32, i32* %in syncscope("wavefront-one-as") monotonic, align 4 - store i32 %val, i32* %out + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel ret void } -define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( -; GFX7-LABEL: flat_wavefront_one_as_acquire_load: +define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( +; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load: +; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load: +; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_load: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %in, i32* %out) { + i32* %out, i32 %in) { entry: - %val = load atomic i32, i32* %in syncscope("wavefront-one-as") acquire, align 4 - store i32 %val, i32* %out + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst ret void } -define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( -; GFX7-LABEL: flat_wavefront_one_as_seq_cst_load: +define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( +; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load: +; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load: +; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_load: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %in, i32* %out) { + i32* %out, i32 %in) { entry: - %val = load atomic i32, i32* %in syncscope("wavefront-one-as") seq_cst, align 4 - store i32 %val, i32* %out + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire + store i32 %val, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( -; GFX7-LABEL: flat_wavefront_one_as_unordered_store: +define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( +; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store: +; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store: +; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_unordered_store: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 %in, i32* %out) { + i32* %out, i32 %in) { entry: - store atomic i32 %in, i32* %out syncscope("wavefront-one-as") unordered, align 4 + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel + store i32 %val, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( -; GFX7-LABEL: flat_wavefront_one_as_monotonic_store: +define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( +; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_store: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 %in, i32* %out) { + i32* %out, i32 %in) { entry: - store atomic i32 %in, i32* %out syncscope("wavefront-one-as") monotonic, align 4 + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst + store i32 %val, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_wavefront_one_as_release_store( -; GFX7-LABEL: flat_wavefront_one_as_release_store: +define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store: +; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_release_store: +; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_store: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 %in, i32* %out) { + i32* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32* %out syncscope("wavefront-one-as") release, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic ret void } -define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( -; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store: +define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_store: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 %in, i32* %out) { + i32* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32* %out syncscope("wavefront-one-as") seq_cst, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic ret void } -define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( -; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic ret void } -define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( -; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic ret void } -define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( -; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw: +define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") release + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic ret void } -define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( -; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: +define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: +; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: +; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire ret void } -define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( -; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: +define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: +; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: +; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire ret void } -define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( -; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: +define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: +; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: +; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire - store i32 %val, i32* %out, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire ret void } -define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( -; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: +define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel - store i32 %val, i32* %out, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire ret void } -define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( -; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: +define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst - store i32 %val, i32* %out, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire ret void } -define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( -; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: +define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4313,7 +6237,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4328,7 +6252,7 @@ ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4343,7 +6267,7 @@ ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4357,7 +6281,7 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4367,7 +6291,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4376,17 +6300,15 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst ret void } -define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( -; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: +define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4400,7 +6322,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4415,7 +6337,7 @@ ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4430,7 +6352,7 @@ ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4444,7 +6366,7 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4454,7 +6376,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4463,17 +6385,15 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst ret void } -define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( -; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: +define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4487,7 +6407,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4502,7 +6422,7 @@ ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4517,7 +6437,7 @@ ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4531,7 +6451,7 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4541,7 +6461,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4550,17 +6470,15 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst ret void } -define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( -; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: +define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4574,7 +6492,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4589,7 +6507,7 @@ ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4604,7 +6522,7 @@ ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4618,7 +6536,7 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4628,7 +6546,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4637,17 +6555,15 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst ret void } -define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( -; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: +define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4661,7 +6577,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4676,7 +6592,7 @@ ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4691,7 +6607,7 @@ ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4705,7 +6621,7 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4715,7 +6631,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4724,452 +6640,657 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm ; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( -; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: +define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( -; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: +define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( -; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: +define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( -; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: +define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( -; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5187,7 +7308,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5206,7 +7327,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5225,7 +7346,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5243,7 +7364,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5255,7 +7376,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5266,19 +7387,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5296,7 +7415,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5315,7 +7434,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5334,7 +7453,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5352,7 +7471,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5364,7 +7483,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5375,19 +7494,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5405,7 +7522,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5424,7 +7541,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5443,7 +7560,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5461,7 +7578,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5473,7 +7590,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5484,19 +7601,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5514,7 +7629,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5533,7 +7648,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5552,7 +7667,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5570,7 +7685,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5582,7 +7697,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5593,19 +7708,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5623,7 +7736,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5642,7 +7755,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5661,7 +7774,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5679,7 +7792,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5691,7 +7804,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5702,19 +7815,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5732,7 +7843,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5751,7 +7862,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5770,7 +7881,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5788,7 +7899,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5800,7 +7911,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5811,19 +7922,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5841,7 +7950,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5860,7 +7969,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5879,7 +7988,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5897,7 +8006,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5909,7 +8018,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5920,12 +8029,10 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32* %out, i32 %in, i32 %old) { + i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void @@ -6029,8 +8136,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -1823,6 +1823,100 @@ ret void } +define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( +; GFX7-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire + ret void +} + define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry @@ -2321,8 +2415,8 @@ ret void } -define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2334,14 +2428,13 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2354,14 +2447,13 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2374,14 +2466,13 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2393,14 +2484,13 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2412,7 +2502,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2421,20 +2511,19 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2445,7 +2534,6 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2454,7 +2542,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2466,8 +2554,6 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -2476,7 +2562,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2488,7 +2574,6 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2497,7 +2582,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2508,7 +2593,6 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -2517,27 +2601,25 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -2546,14 +2628,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2566,14 +2648,13 @@ ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2588,14 +2669,13 @@ ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2609,14 +2689,13 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2629,14 +2708,13 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2649,7 +2727,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2659,20 +2737,19 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2683,6 +2760,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2691,7 +2769,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2703,6 +2781,8 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -2711,7 +2791,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2723,6 +2803,7 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2731,7 +2812,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2742,6 +2823,7 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -2750,25 +2832,27 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -2777,14 +2861,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2804,7 +2888,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2826,7 +2910,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2847,7 +2931,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2867,7 +2951,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2880,7 +2964,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2896,14 +2980,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2914,7 +2998,6 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2923,7 +3006,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2935,8 +3018,6 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -2945,7 +3026,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -2957,7 +3038,6 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2966,7 +3046,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2977,7 +3057,6 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -2986,27 +3065,25 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -3015,14 +3092,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3033,7 +3110,6 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3042,7 +3118,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -3054,8 +3130,6 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -3064,7 +3138,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -3076,7 +3150,6 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3085,7 +3158,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3096,7 +3169,6 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3105,27 +3177,25 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -3134,14 +3204,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( -; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3161,7 +3231,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -3183,7 +3253,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -3204,7 +3274,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3224,7 +3294,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3237,7 +3307,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3253,1308 +3323,3133 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( -; GFX7-LABEL: flat_workgroup_one_as_unordered_load: +define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load: +; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load: +; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_unordered_load: +; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %in, i32* %out) { + i32* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32* %in syncscope("workgroup-one-as") unordered, align 4 - store i32 %val, i32* %out + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( -; GFX7-LABEL: flat_workgroup_one_as_monotonic_load: +define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load: +; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_load: +; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_load: +; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( +; GFX7-LABEL: flat_workgroup_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("workgroup-one-as") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( +; GFX7-LABEL: flat_workgroup_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("workgroup-one-as") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( +; GFX7-LABEL: flat_workgroup_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("workgroup-one-as") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( +; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("workgroup-one-as") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( +; GFX7-LABEL: flat_workgroup_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("workgroup-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( +; GFX7-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("workgroup-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_release_store( +; GFX7-LABEL: flat_workgroup_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("workgroup-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( +; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("workgroup-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( +; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") monotonic + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( +; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( +; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") release + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( +; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( +; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %in, i32* %out) { + i32* %out, i32 %in) { entry: - %val = load atomic i32, i32* %in syncscope("workgroup-one-as") monotonic, align 4 - store i32 %val, i32* %out + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst ret void } -define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( -; GFX7-LABEL: flat_workgroup_one_as_acquire_load: +define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( +; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load: +; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load: +; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_load: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %in, i32* %out) { + i32* %out, i32 %in) { entry: - %val = load atomic i32, i32* %in syncscope("workgroup-one-as") acquire, align 4 - store i32 %val, i32* %out + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire + store i32 %val, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( -; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load: +define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( +; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_load: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_load_dword v0, v[0:1] -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %in, i32* %out) { + i32* %out, i32 %in) { entry: - %val = load atomic i32, i32* %in syncscope("workgroup-one-as") seq_cst, align 4 - store i32 %val, i32* %out + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel + store i32 %val, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( -; GFX7-LABEL: flat_workgroup_one_as_unordered_store: +define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( +; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store: +; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store: +; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_unordered_store: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32* %out) { + i32* %out, i32 %in) { entry: - store atomic i32 %in, i32* %out syncscope("workgroup-one-as") unordered, align 4 + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst + store i32 %val, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( -; GFX7-LABEL: flat_workgroup_one_as_monotonic_store: +define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_store: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32* %out) { + i32* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32* %out syncscope("workgroup-one-as") monotonic, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic ret void } -define amdgpu_kernel void @flat_workgroup_one_as_release_store( -; GFX7-LABEL: flat_workgroup_one_as_release_store: +define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store: +; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_release_store: +; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_store: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32* %out) { + i32* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32* %out syncscope("workgroup-one-as") release, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic ret void } -define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( -; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store: +define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store: +; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store: +; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_store: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32* %out) { + i32* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32* %out syncscope("workgroup-one-as") seq_cst, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic ret void } -define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( -; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") monotonic + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic ret void } -define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( -; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic ret void } -define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( -; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw: +define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") release + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire ret void } -define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( -; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire ret void } -define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( -; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire ret void } -define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( -; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: +define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: +; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: +; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire - store i32 %val, i32* %out, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire ret void } -define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( -; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: +define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel - store i32 %val, i32* %out, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire ret void } -define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( -; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: +define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32* %out, i32 %in) { + i32* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst - store i32 %val, i32* %out, align 4 + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst ret void } -define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( -; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: +define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4568,7 +6463,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4580,10 +6475,14 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4598,7 +6497,7 @@ ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4612,7 +6511,7 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4622,24 +6521,27 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst ret void } -define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( -; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: +define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4653,7 +6555,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4665,12 +6567,14 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4685,7 +6589,7 @@ ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4699,7 +6603,7 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4709,13 +6613,14 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -4723,12 +6628,12 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst ret void } -define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( -; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: +define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4742,7 +6647,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4757,9 +6662,11 @@ ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4774,7 +6681,7 @@ ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4788,7 +6695,7 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4798,7 +6705,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4807,16 +6714,18 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst ret void } -define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( -; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: +define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4830,7 +6739,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4849,7 +6758,7 @@ ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -4864,7 +6773,7 @@ ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4878,7 +6787,7 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4888,7 +6797,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4903,265 +6812,429 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst ret void } -define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( -; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: +define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( -; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: +define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( -; GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: +define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5169,91 +7242,112 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( -; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: +define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5261,203 +7355,242 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( -; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: +define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( -; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 -; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 -; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 -; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5475,7 +7608,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5487,6 +7620,8 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -5496,7 +7631,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5515,7 +7650,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5533,7 +7668,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5545,13 +7680,14 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -5560,14 +7696,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5585,7 +7721,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5608,7 +7744,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5627,7 +7763,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5645,7 +7781,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5657,7 +7793,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5673,14 +7809,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( -; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5698,7 +7834,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5721,7 +7857,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5740,7 +7876,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5758,7 +7894,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5770,7 +7906,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5786,14 +7922,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5811,7 +7947,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5823,6 +7959,8 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -5832,7 +7970,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5851,7 +7989,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5869,7 +8007,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5881,13 +8019,14 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -5896,14 +8035,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5921,7 +8060,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5944,7 +8083,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -5963,7 +8102,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5981,7 +8120,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5993,7 +8132,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6009,14 +8148,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6034,7 +8173,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6057,7 +8196,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6076,7 +8215,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6094,7 +8233,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6106,7 +8245,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6122,14 +8261,14 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void } -define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( -; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6147,7 +8286,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6170,7 +8309,7 @@ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 @@ -6189,7 +8328,7 @@ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6207,7 +8346,7 @@ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6219,7 +8358,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6235,7 +8374,7 @@ i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -2070,6 +2070,110 @@ ret void } +define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( +; GFX6-LABEL: global_agent_monotonic_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") monotonic acquire + ret void +} + define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry @@ -2513,8 +2617,8 @@ ret void } -define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( -; GFX6-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2529,7 +2633,7 @@ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX7-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2546,7 +2650,7 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2563,7 +2667,7 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2580,7 +2684,7 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2594,7 +2698,7 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2607,7 +2711,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2622,12 +2726,12 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") monotonic seq_cst ret void } -define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2636,32 +2740,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2670,14 +2772,15 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2686,14 +2789,15 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2702,47 +2806,45 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(1)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acquire seq_cst ret void } -define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( +; GFX6-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2752,33 +2854,29 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2789,14 +2887,13 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2807,14 +2904,13 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2824,12 +2920,11 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2837,13 +2932,12 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2851,22 +2945,19 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(1)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") release seq_cst ret void } -define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2876,33 +2967,29 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2913,14 +3000,13 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2931,14 +3017,13 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2948,12 +3033,11 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2961,13 +3045,12 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2975,22 +3058,19 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(1)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel seq_cst ret void } -define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( -; GFX6-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2999,13 +3079,125 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX7-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3017,14 +3209,13 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3035,12 +3226,10 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3051,12 +3240,10 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3070,7 +3257,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3079,11 +3266,10 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3092,20 +3278,19 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( -; GFX6-LABEL: global_agent_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -3114,14 +3299,13 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX7-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3132,7 +3316,6 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -3141,7 +3324,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3150,8 +3333,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -3159,7 +3340,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3168,8 +3349,6 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv @@ -3177,7 +3356,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3186,34 +3365,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -3222,14 +3398,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") release acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( -; GFX6-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -3241,11 +3417,10 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX7-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3258,14 +3433,13 @@ ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3278,12 +3452,10 @@ ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3296,12 +3468,10 @@ ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3316,7 +3486,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3326,11 +3496,10 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3340,20 +3509,19 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( -; GFX6-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -3369,7 +3537,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX7-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3389,7 +3557,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3407,7 +3575,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3425,7 +3593,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3440,7 +3608,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3454,7 +3622,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3470,14 +3638,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( -; GFX6-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -3493,7 +3661,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX7-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3513,7 +3681,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3531,7 +3699,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3549,7 +3717,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3564,7 +3732,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3578,7 +3746,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3594,1560 +3762,3959 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_agent_one_as_unordered_load( -; GFX6-LABEL: global_agent_one_as_unordered_load: +define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_unordered_load: +; GFX7-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_unordered_load: +; GFX10-WGP-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_agent_release_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_release_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_release_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_release_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_unordered_load( +; GFX6-LABEL: global_agent_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_one_as_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_agent_one_as_monotonic_load( +; GFX6-LABEL: global_agent_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acquire_load( +; GFX6-LABEL: global_agent_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_load( +; GFX6-LABEL: global_agent_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_agent_one_as_unordered_store( +; GFX6-LABEL: global_agent_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_one_as_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_monotonic_store( +; GFX6-LABEL: global_agent_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_release_store( +; GFX6-LABEL: global_agent_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_store( +; GFX6-LABEL: global_agent_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( +; GFX6-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( +; GFX6-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acquire + ret void +} + +define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( +; GFX6-LABEL: global_agent_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") release + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_unordered_load: +; GFX10-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_unordered_load: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_load: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_unordered_load: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in) { entry: - %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") unordered, align 4 - store i32 %val, i32 addrspace(1)* %out + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acq_rel ret void } -define amdgpu_kernel void @global_agent_one_as_monotonic_load( -; GFX6-LABEL: global_agent_one_as_monotonic_load: +define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_monotonic_load: +; GFX7-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_monotonic_load: +; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_monotonic_load: +; GFX10-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_load: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_load: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in) { entry: - %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") monotonic, align 4 - store i32 %val, i32 addrspace(1)* %out + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") seq_cst ret void } -define amdgpu_kernel void @global_agent_one_as_acquire_load( -; GFX6-LABEL: global_agent_one_as_acquire_load: +define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_acquire_load: +; GFX7-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_acquire_load: +; GFX10-WGP-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_acquire_load: +; GFX10-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_load: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_load: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_load: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in) { entry: - %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") acquire, align 4 - store i32 %val, i32 addrspace(1)* %out + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_agent_one_as_seq_cst_load( -; GFX6-LABEL: global_agent_one_as_seq_cst_load: +define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_seq_cst_load: +; GFX7-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_load: +; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_seq_cst_load: +; GFX10-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_load: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_load: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in) { entry: - %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") seq_cst, align 4 - store i32 %val, i32 addrspace(1)* %out + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_agent_one_as_unordered_store( -; GFX6-LABEL: global_agent_one_as_unordered_store: +define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_unordered_store: +; GFX7-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_unordered_store: +; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_unordered_store: +; GFX10-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_unordered_store: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_store: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_unordered_store: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in) { entry: - store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") unordered, align 4 + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_agent_one_as_monotonic_store( -; GFX6-LABEL: global_agent_one_as_monotonic_store: +define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_monotonic_store: +; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_monotonic_store: +; GFX10-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_monotonic_store: +; GFX10-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_store: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_store: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") monotonic, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic ret void } -define amdgpu_kernel void @global_agent_one_as_release_store( -; GFX6-LABEL: global_agent_one_as_release_store: +define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_release_store: +; GFX7-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_release_store: +; GFX10-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_release_store: +; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_store: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_store: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_store: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") release, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic ret void } -define amdgpu_kernel void @global_agent_one_as_seq_cst_store( -; GFX6-LABEL: global_agent_one_as_seq_cst_store: +define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_seq_cst_store: +; GFX7-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_store: +; GFX10-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_seq_cst_store: +; GFX10-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_store: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_store: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") seq_cst, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic ret void } -define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( -; GFX6-LABEL: global_agent_one_as_monotonic_atomicrmw: +define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") monotonic + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic ret void } -define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( -; GFX6-LABEL: global_agent_one_as_acquire_atomicrmw: +define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acquire + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic ret void } -define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( -; GFX6-LABEL: global_agent_one_as_release_atomicrmw: +define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( +; GFX6-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_release_atomicrmw: +; GFX7-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_release_atomicrmw: +; GFX10-WGP-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_release_atomicrmw: +; GFX10-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") release + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic acquire ret void } -define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( -; GFX6-LABEL: global_agent_one_as_acq_rel_atomicrmw: +define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX7-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX10-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acq_rel + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire ret void } -define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( -; GFX6-LABEL: global_agent_one_as_seq_cst_atomicrmw: +define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_seq_cst_atomicrmw: +; GFX7-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_atomicrmw: +; GFX10-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw: +; GFX10-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") seq_cst + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire ret void } -define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( -; GFX6-LABEL: global_agent_one_as_acquire_ret_atomicrmw: +define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_acquire_ret_atomicrmw: +; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_acquire_ret_atomicrmw: +; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw: +; GFX10-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acquire - store i32 %val, i32 addrspace(1)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire ret void } -define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( -; GFX6-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: +define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: +; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acq_rel - store i32 %val, i32 addrspace(1)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire ret void } -define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( -; GFX6-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: +define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: +; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic seq_cst ret void } -define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( -; GFX6-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: +define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5156,10 +7723,13 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5170,10 +7740,13 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5182,10 +7755,15 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5194,10 +7772,15 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5206,37 +7789,45 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire seq_cst ret void } -define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( -; GFX6-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: +define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( +; GFX6-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5245,12 +7836,13 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: +; GFX7-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5261,12 +7853,13 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5275,13 +7868,15 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5290,13 +7885,15 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5305,29 +7902,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -5335,12 +7935,12 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release seq_cst ret void } -define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( -; GFX6-LABEL: global_agent_one_as_release_monotonic_cmpxchg: +define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5351,9 +7951,11 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_release_monotonic_cmpxchg: +; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5366,9 +7968,11 @@ ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5380,9 +7984,12 @@ ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5394,9 +8001,12 @@ ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5407,9 +8017,10 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5418,9 +8029,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5429,16 +8042,18 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel seq_cst ret void } -define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( -; GFX6-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: +define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5453,7 +8068,7 @@ ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5470,7 +8085,7 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5487,7 +8102,7 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5504,7 +8119,7 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5518,7 +8133,7 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5531,7 +8146,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5546,12 +8161,12 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst ret void } -define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( -; GFX6-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: +define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5560,30 +8175,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5592,15 +8207,12 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5609,15 +8221,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5626,45 +8235,45 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( -; GFX6-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: +define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5673,28 +8282,32 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: +; GFX7-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5703,13 +8316,14 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: +; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5718,13 +8332,14 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5733,42 +8348,47 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( -; GFX6-LABEL: global_agent_one_as_release_acquire_cmpxchg: +define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5778,29 +8398,33 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_release_acquire_cmpxchg: +; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg: +; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5811,13 +8435,14 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg: +; GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5828,13 +8453,14 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5844,11 +8470,12 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5856,12 +8483,13 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5869,19 +8497,22 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( -; GFX6-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: +define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5891,29 +8522,33 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5924,13 +8559,14 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5941,13 +8577,14 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5957,11 +8594,12 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5969,12 +8607,13 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5982,19 +8621,22 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( -; GFX6-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: +define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6003,30 +8645,32 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX7-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6035,15 +8679,14 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6052,15 +8695,14 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6069,45 +8711,47 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( -; GFX6-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6116,30 +8760,32 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6148,15 +8794,14 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6165,15 +8810,14 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6182,45 +8826,47 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6229,13 +8875,14 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6246,6 +8893,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -6254,7 +8902,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6263,6 +8911,8 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -6270,7 +8920,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6279,6 +8929,8 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv @@ -6286,7 +8938,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6295,31 +8947,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -6328,14 +8983,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6351,7 +9006,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6371,7 +9026,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6389,7 +9044,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6407,7 +9062,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6422,7 +9077,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6436,7 +9091,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6452,14 +9107,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6475,7 +9130,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6495,7 +9150,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6513,7 +9168,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6531,7 +9186,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6546,7 +9201,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6560,7 +9215,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6576,14 +9231,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( -; GFX6-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6592,13 +9247,14 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6609,6 +9265,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -6617,7 +9274,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6626,6 +9283,8 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -6633,7 +9292,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6642,6 +9301,8 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv @@ -6649,7 +9310,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6658,31 +9319,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -6691,14 +9355,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( -; GFX6-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6714,7 +9378,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: +; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6734,7 +9398,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6752,7 +9416,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6770,7 +9434,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6785,7 +9449,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6799,7 +9463,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6815,14 +9479,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( -; GFX6-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6838,7 +9502,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6858,7 +9522,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6876,7 +9540,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6894,7 +9558,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6909,7 +9573,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6923,7 +9587,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6939,14 +9603,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( -; GFX6-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6962,7 +9626,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6982,7 +9646,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -7000,7 +9664,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -7018,7 +9682,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -7033,7 +9697,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -7047,7 +9711,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -7063,7 +9727,7 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -1845,6 +1845,95 @@ ret void } +define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( +; GFX6-LABEL: global_singlethread_monotonic_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire + ret void +} + define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry @@ -2201,8 +2290,8 @@ ret void } -define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( -; GFX6-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2214,7 +2303,7 @@ ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX7-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2228,7 +2317,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2240,7 +2329,7 @@ ; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2252,7 +2341,7 @@ ; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2264,7 +2353,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2274,7 +2363,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2286,12 +2375,12 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst ret void } -define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2300,30 +2389,24 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2332,12 +2415,10 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2346,12 +2427,10 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2360,45 +2439,37 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(1)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst ret void } -define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( +; GFX6-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2407,30 +2478,24 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2439,12 +2504,10 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2453,12 +2516,10 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2467,45 +2528,37 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(1)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst ret void } -define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2514,30 +2567,24 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2546,12 +2593,10 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2560,12 +2605,10 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2574,45 +2617,37 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(1)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst ret void } -define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( -; GFX6-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2621,30 +2656,24 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2653,12 +2682,10 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2667,12 +2694,10 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2681,45 +2706,37 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(1)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst ret void } -define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( -; GFX6-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2733,7 +2750,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX7-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2751,7 +2768,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2765,7 +2782,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2779,7 +2796,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2793,7 +2810,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2805,7 +2822,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2819,14 +2836,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( -; GFX6-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2840,7 +2857,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX7-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2858,7 +2875,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2872,7 +2889,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2886,7 +2903,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2900,7 +2917,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2912,7 +2929,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2926,14 +2943,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( -; GFX6-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2947,7 +2964,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX7-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2965,7 +2982,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2979,7 +2996,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2993,7 +3010,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3007,7 +3024,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3019,7 +3036,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3033,14 +3050,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( -; GFX6-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -3054,7 +3071,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX7-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3072,7 +3089,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3086,7 +3103,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3100,7 +3117,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3114,7 +3131,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3126,7 +3143,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3140,1407 +3157,3652 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_singlethread_one_as_unordered_load( -; GFX6-LABEL: global_singlethread_one_as_unordered_load: +define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_unordered_load: +; GFX7-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_unordered_load: +; GFX10-WGP-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_unordered_load: +; GFX10-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_unordered_load: +; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_load: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_unordered_load: +; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") unordered, align 4 - store i32 %val, i32 addrspace(1)* %out + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( -; GFX6-LABEL: global_singlethread_one_as_monotonic_load: +define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_monotonic_load: +; GFX7-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_load: +; GFX10-WGP-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_load: +; GFX10-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_unordered_load( +; GFX6-LABEL: global_singlethread_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( +; GFX6-LABEL: global_singlethread_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acquire_load( +; GFX6-LABEL: global_singlethread_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( +; GFX6-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_unordered_store( +; GFX6-LABEL: global_singlethread_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( +; GFX6-LABEL: global_singlethread_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_release_store( +; GFX6-LABEL: global_singlethread_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( +; GFX6-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( +; GFX6-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( +; GFX6-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acquire + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( +; GFX6-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") release + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_load: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_load: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in) { entry: - %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") monotonic, align 4 - store i32 %val, i32 addrspace(1)* %out + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_singlethread_one_as_acquire_load( -; GFX6-LABEL: global_singlethread_one_as_acquire_load: +define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_acquire_load: +; GFX7-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_load: +; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_acquire_load: +; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_load: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_load: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_load: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in) { entry: - %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") acquire, align 4 - store i32 %val, i32 addrspace(1)* %out + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( -; GFX6-LABEL: global_singlethread_one_as_seq_cst_load: +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX7-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_load: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in) { entry: - %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") seq_cst, align 4 - store i32 %val, i32 addrspace(1)* %out + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_singlethread_one_as_unordered_store( -; GFX6-LABEL: global_singlethread_one_as_unordered_store: +define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_unordered_store: +; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_unordered_store: +; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_unordered_store: +; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_unordered_store: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_store: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_unordered_store: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") unordered, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic ret void } -define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( -; GFX6-LABEL: global_singlethread_one_as_monotonic_store: +define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_monotonic_store: +; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_store: +; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_store: +; GFX10-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_store: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_store: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") monotonic, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic ret void } -define amdgpu_kernel void @global_singlethread_one_as_release_store( -; GFX6-LABEL: global_singlethread_one_as_release_store: +define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_release_store: +; GFX7-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_release_store: +; GFX10-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_release_store: +; GFX10-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_store: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_store: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_store: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") release, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic ret void } -define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( -; GFX6-LABEL: global_singlethread_one_as_seq_cst_store: +define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_store: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") seq_cst, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic ret void } -define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( -; GFX6-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") monotonic + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic ret void } -define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( -; GFX6-LABEL: global_singlethread_one_as_acquire_atomicrmw: +define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acquire + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire ret void } -define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( -; GFX6-LABEL: global_singlethread_one_as_release_atomicrmw: +define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX10-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") release + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire ret void } -define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( -; GFX6-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX7-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX10-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX10-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acq_rel + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire ret void } -define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( -; GFX6-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") seq_cst + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire ret void } -define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( -; GFX6-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acquire - store i32 %val, i32 addrspace(1)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire ret void } -define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( -; GFX6-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: +define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acq_rel - store i32 %val, i32 addrspace(1)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst ret void } -define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( -; GFX6-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: +define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst ret void } -define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxchg( -; GFX6-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: +define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -4552,7 +6814,7 @@ ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4566,7 +6828,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4578,7 +6840,7 @@ ; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4590,7 +6852,7 @@ ; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4602,7 +6864,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4612,7 +6874,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4624,12 +6886,12 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst ret void } -define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( -; GFX6-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: +define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -4641,7 +6903,7 @@ ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4655,7 +6917,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4667,7 +6929,7 @@ ; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4679,7 +6941,7 @@ ; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4691,7 +6953,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4701,7 +6963,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4713,12 +6975,12 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst ret void } -define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( -; GFX6-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -4730,7 +6992,7 @@ ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: +; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4744,7 +7006,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4756,7 +7018,7 @@ ; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4768,7 +7030,7 @@ ; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4780,7 +7042,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4790,7 +7052,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4802,12 +7064,12 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst ret void } -define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( -; GFX6-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: +define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -4816,24 +7078,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4842,10 +7110,12 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4854,10 +7124,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4866,37 +7138,45 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( -; GFX6-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: +define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -4905,24 +7185,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4931,10 +7217,12 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4943,10 +7231,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4955,37 +7245,45 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( -; GFX6-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: +define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -4994,24 +7292,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX7-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5020,10 +7324,12 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5032,10 +7338,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5044,37 +7352,45 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( -; GFX6-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: +define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5083,24 +7399,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: +; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5109,10 +7431,12 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5121,10 +7445,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5133,37 +7459,45 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( -; GFX6-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5172,24 +7506,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5198,10 +7538,12 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5210,10 +7552,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5222,37 +7566,45 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( -; GFX6-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: +define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5261,24 +7613,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5287,10 +7645,12 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5299,10 +7659,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5311,37 +7673,45 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( -; GFX6-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5350,24 +7720,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5376,10 +7752,12 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5388,10 +7766,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5400,37 +7780,45 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5444,7 +7832,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5462,7 +7850,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5476,7 +7864,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5490,7 +7878,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5504,7 +7892,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5516,7 +7904,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5530,14 +7918,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5551,7 +7939,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5569,7 +7957,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5583,7 +7971,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5597,7 +7985,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5611,7 +7999,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5623,7 +8011,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5637,14 +8025,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5658,7 +8046,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5676,7 +8064,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5690,7 +8078,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5704,7 +8092,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5718,7 +8106,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5730,7 +8118,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5744,14 +8132,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxchg( -; GFX6-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5765,7 +8153,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5783,7 +8171,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5797,7 +8185,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5811,7 +8199,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5825,7 +8213,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5837,7 +8225,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5851,14 +8239,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxchg( -; GFX6-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5872,7 +8260,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5890,7 +8278,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5904,7 +8292,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5918,7 +8306,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5932,7 +8320,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5944,7 +8332,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5958,14 +8346,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( -; GFX6-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5979,7 +8367,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5997,7 +8385,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6011,7 +8399,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6025,7 +8413,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6039,7 +8427,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6051,7 +8439,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6065,14 +8453,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( -; GFX6-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6086,7 +8474,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6104,7 +8492,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6118,7 +8506,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6132,7 +8520,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6146,7 +8534,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6158,7 +8546,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6172,7 +8560,7 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -2112,6 +2112,112 @@ ret void } +define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( +; GFX6-LABEL: global_system_monotonic_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_system_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_system_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in monotonic acquire + ret void +} + define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry @@ -2686,8 +2792,8 @@ ret void } -define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_system_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2698,11 +2804,10 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_acquire_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2714,14 +2819,13 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2732,12 +2836,10 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2748,12 +2850,10 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2767,7 +2867,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2776,12 +2876,10 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2790,21 +2888,19 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acquire monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2813,14 +2909,13 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2831,7 +2926,6 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -2840,7 +2934,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2849,8 +2943,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -2858,7 +2950,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2867,8 +2959,6 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv @@ -2876,7 +2966,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2885,21 +2975,18 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 @@ -2907,15 +2994,13 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 @@ -2925,14 +3010,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acq_rel monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2948,7 +3033,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2968,7 +3053,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2986,7 +3071,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3004,7 +3089,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3019,7 +3104,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3035,7 +3120,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3053,14 +3138,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( -; GFX6-LABEL: global_system_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -3069,13 +3154,14 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg: +; GFX7-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3086,6 +3172,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -3094,7 +3181,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3103,6 +3190,8 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -3110,7 +3199,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3119,6 +3208,8 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv @@ -3126,7 +3217,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3135,18 +3226,21 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 @@ -3154,13 +3248,15 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 @@ -3170,14 +3266,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acquire acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( -; GFX6-LABEL: global_system_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -3186,14 +3282,13 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg: +; GFX7-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3204,7 +3299,6 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -3213,7 +3307,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3222,8 +3316,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -3231,7 +3323,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3240,8 +3332,6 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv @@ -3249,7 +3339,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3258,21 +3348,18 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 @@ -3280,15 +3367,13 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 @@ -3298,14 +3383,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in release acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( -; GFX6-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -3314,14 +3399,13 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: +; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3332,7 +3416,6 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -3341,7 +3424,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3350,8 +3433,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -3359,7 +3440,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3368,8 +3449,6 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv @@ -3377,7 +3456,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3386,21 +3465,18 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 @@ -3408,15 +3484,13 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 @@ -3426,14 +3500,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acq_rel acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( -; GFX6-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -3449,7 +3523,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: +; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3469,7 +3543,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3487,7 +3561,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3505,7 +3579,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3520,7 +3594,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3536,7 +3610,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3554,14 +3628,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( -; GFX6-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -3577,7 +3651,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3597,7 +3671,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3615,7 +3689,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3633,7 +3707,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3648,7 +3722,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3664,7 +3738,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3682,1590 +3756,3684 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_one_as_unordered_load( -; GFX6-LABEL: global_system_one_as_unordered_load: +define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_unordered_load: +; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_unordered_load: +; GFX10-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_unordered_load: +; GFX10-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_unordered_load: +; SKIP-CACHE-INV-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_unordered_load: +; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_unordered_load: +; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") unordered, align 4 - store i32 %val, i32 addrspace(1)* %out + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_one_as_monotonic_load( -; GFX6-LABEL: global_system_one_as_monotonic_load: +define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_monotonic_load: +; GFX7-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_monotonic_load: +; GFX10-WGP-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_monotonic_load: +; GFX10-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_load: +; SKIP-CACHE-INV-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_system_relese_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_system_relese_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_relese_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_system_relese_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_unordered_load( +; GFX6-LABEL: global_system_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_system_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_system_one_as_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_system_one_as_monotonic_load( +; GFX6-LABEL: global_system_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_system_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_system_one_as_acquire_load( +; GFX6-LABEL: global_system_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_system_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_load( +; GFX6-LABEL: global_system_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_system_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_system_one_as_unordered_store( +; GFX6-LABEL: global_system_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_system_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_system_one_as_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_monotonic_store( +; GFX6-LABEL: global_system_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_system_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_release_store( +; GFX6-LABEL: global_system_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_system_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_system_one_as_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_store( +; GFX6-LABEL: global_system_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_system_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( +; GFX6-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( +; GFX6-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire + ret void +} + +define amdgpu_kernel void @global_system_one_as_release_atomicrmw( +; GFX6-LABEL: global_system_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_system_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_system_one_as_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") release + ret void +} + +define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_load: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in) { entry: - %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") monotonic, align 4 - store i32 %val, i32 addrspace(1)* %out + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst ret void } -define amdgpu_kernel void @global_system_one_as_acquire_load( -; GFX6-LABEL: global_system_one_as_acquire_load: +define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_acquire_load: +; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_acquire_load: +; GFX10-WGP-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_acquire_load: +; GFX10-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_load: +; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_load: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_load: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in) { entry: - %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") acquire, align 4 - store i32 %val, i32 addrspace(1)* %out + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_one_as_seq_cst_load( -; GFX6-LABEL: global_system_one_as_seq_cst_load: +define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_seq_cst_load: +; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_seq_cst_load: +; GFX10-WGP-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_seq_cst_load: +; GFX10-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_load: +; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_load: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in) { entry: - %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") seq_cst, align 4 - store i32 %val, i32 addrspace(1)* %out + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_one_as_unordered_store( -; GFX6-LABEL: global_system_one_as_unordered_store: +define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_unordered_store: +; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_unordered_store: +; GFX10-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_unordered_store: +; GFX10-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_unordered_store: +; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_unordered_store: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_unordered_store: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") unordered, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic ret void } -define amdgpu_kernel void @global_system_one_as_monotonic_store( -; GFX6-LABEL: global_system_one_as_monotonic_store: +define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_monotonic_store: +; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_monotonic_store: +; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_monotonic_store: +; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_store: +; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_store: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") monotonic, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic ret void } -define amdgpu_kernel void @global_system_one_as_release_store( -; GFX6-LABEL: global_system_one_as_release_store: +define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_release_store: +; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_release_store: +; GFX10-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_release_store: +; GFX10-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_release_store: +; SKIP-CACHE-INV-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_store: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_store: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") release, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic ret void } -define amdgpu_kernel void @global_system_one_as_seq_cst_store( -; GFX6-LABEL: global_system_one_as_seq_cst_store: +define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_seq_cst_store: +; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_seq_cst_store: +; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_seq_cst_store: +; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_store: +; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_store: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") seq_cst, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic ret void } -define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( -; GFX6-LABEL: global_system_one_as_monotonic_atomicrmw: +define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") monotonic + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic ret void } -define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( -; GFX6-LABEL: global_system_one_as_acquire_atomicrmw: +define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( +; GFX6-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX7-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX10-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX10-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire ret void } -define amdgpu_kernel void @global_system_one_as_release_atomicrmw( -; GFX6-LABEL: global_system_one_as_release_atomicrmw: +define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_release_atomicrmw: +; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_release_atomicrmw: +; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_release_atomicrmw: +; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_release_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") release + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire ret void } -define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( -; GFX6-LABEL: global_system_one_as_acq_rel_atomicrmw: +define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX10-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX10-CU-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_system_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire ret void } -define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( -; GFX6-LABEL: global_system_one_as_seq_cst_atomicrmw: +define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire ret void } -define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( -; GFX6-LABEL: global_system_one_as_acquire_ret_atomicrmw: +define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw: +; GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_acquire_ret_atomicrmw: +; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw: +; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire - store i32 %val, i32 addrspace(1)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire ret void } -define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( -; GFX6-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel - store i32 %val, i32 addrspace(1)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst ret void } -define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( -; GFX6-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +; GFX7-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst ret void } -define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( -; GFX6-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: +define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( +; GFX6-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5274,10 +7442,13 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: +; GFX7-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5288,10 +7459,13 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5300,10 +7474,15 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5312,10 +7491,15 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5324,37 +7508,49 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst ret void } -define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( -; GFX6-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: +define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5363,12 +7559,13 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: +; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5379,12 +7576,13 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5393,13 +7591,15 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5408,13 +7608,15 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5423,30 +7625,35 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 @@ -5455,12 +7662,12 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst ret void } -define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( -; GFX6-LABEL: global_system_one_as_release_monotonic_cmpxchg: +define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5471,9 +7678,11 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg: +; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5486,9 +7695,11 @@ ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5500,9 +7711,12 @@ ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5514,9 +7728,12 @@ ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5527,9 +7744,10 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5539,9 +7757,12 @@ ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5551,16 +7772,19 @@ ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst ret void } -define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( -; GFX6-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: +define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5569,30 +7793,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX7-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5601,15 +7825,12 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5618,15 +7839,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5635,49 +7853,45 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( -; GFX6-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: +define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5686,30 +7900,32 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5718,15 +7934,14 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5735,15 +7950,14 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5752,49 +7966,49 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( -; GFX6-LABEL: global_system_one_as_acquire_acquire_cmpxchg: +define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5803,28 +8017,32 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg: +; GFX7-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5833,13 +8051,14 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg: +; GFX10-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5848,13 +8067,14 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5863,44 +8083,50 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( -; GFX6-LABEL: global_system_one_as_release_acquire_cmpxchg: +define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5910,29 +8136,33 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg: +; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg: +; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5943,13 +8173,14 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_release_acquire_cmpxchg: +; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5960,13 +8191,14 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_release_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5976,11 +8208,12 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5989,13 +8222,14 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6004,20 +8238,23 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( -; GFX6-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: +define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6027,29 +8264,33 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: +; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6060,13 +8301,14 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6077,13 +8319,14 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6093,11 +8336,12 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6106,13 +8350,14 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6121,20 +8366,23 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( -; GFX6-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: +define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6143,30 +8391,32 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: +; GFX7-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6175,15 +8425,14 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6192,15 +8441,14 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6209,49 +8457,49 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( -; GFX6-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6260,30 +8508,32 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6292,15 +8542,14 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6309,15 +8558,14 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6326,49 +8574,49 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6377,13 +8625,14 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6394,6 +8643,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -6402,7 +8652,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6411,6 +8661,8 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -6418,7 +8670,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6427,6 +8679,8 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv @@ -6434,7 +8688,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6443,18 +8697,21 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 @@ -6462,13 +8719,15 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 @@ -6478,14 +8737,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6501,7 +8760,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6521,7 +8780,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6539,7 +8798,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6557,7 +8816,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6572,7 +8831,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6588,7 +8847,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6606,14 +8865,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6629,7 +8888,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6649,7 +8908,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6667,7 +8926,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6685,7 +8944,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6700,7 +8959,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6716,7 +8975,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6734,14 +8993,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( -; GFX6-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6750,13 +9009,14 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6767,6 +9027,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol @@ -6775,7 +9036,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6784,6 +9045,8 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -6791,7 +9054,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6800,6 +9063,8 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv @@ -6807,7 +9072,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6816,18 +9081,21 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 @@ -6835,13 +9103,15 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 @@ -6851,14 +9121,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( -; GFX6-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6874,7 +9144,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: +; GFX7-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6894,7 +9164,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6912,7 +9182,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6930,7 +9200,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6945,7 +9215,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6961,7 +9231,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6979,14 +9249,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( -; GFX6-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -7002,7 +9272,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -7022,7 +9292,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -7040,7 +9310,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -7058,7 +9328,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -7073,7 +9343,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -7089,7 +9359,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -7107,14 +9377,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( -; GFX6-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -7130,7 +9400,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -7150,7 +9420,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -7168,7 +9438,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -7186,7 +9456,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -7201,7 +9471,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -7217,7 +9487,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -7235,7 +9505,7 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -1845,6 +1845,95 @@ ret void } +define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( +; GFX6-LABEL: global_wavefront_monotonic_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire + ret void +} + define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry @@ -2201,8 +2290,8 @@ ret void } -define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( -; GFX6-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2214,7 +2303,7 @@ ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX7-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2228,7 +2317,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2240,7 +2329,7 @@ ; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2252,7 +2341,7 @@ ; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2264,7 +2353,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2274,7 +2363,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2286,12 +2375,12 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst ret void } -define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2300,30 +2389,24 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2332,12 +2415,10 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2346,12 +2427,10 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2360,45 +2439,37 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(1)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst ret void } -define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( +; GFX6-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2407,30 +2478,24 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2439,12 +2504,10 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2453,12 +2516,10 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2467,45 +2528,37 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(1)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst ret void } -define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2514,30 +2567,24 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2546,12 +2593,10 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2560,12 +2605,10 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2574,45 +2617,37 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(1)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst ret void } -define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( -; GFX6-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2621,30 +2656,24 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2653,12 +2682,10 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2667,12 +2694,10 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2681,45 +2706,37 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(1)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst ret void } -define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( -; GFX6-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2733,7 +2750,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX7-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2751,7 +2768,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2765,7 +2782,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2779,7 +2796,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2793,7 +2810,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2805,7 +2822,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2816,17 +2833,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in, i32 %old) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( -; GFX6-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2840,7 +2857,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX7-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2858,7 +2875,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2872,7 +2889,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2886,7 +2903,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2900,7 +2917,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2912,7 +2929,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2926,14 +2943,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( -; GFX6-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2947,7 +2964,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX7-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2965,7 +2982,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2979,7 +2996,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2993,7 +3010,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3007,7 +3024,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3019,7 +3036,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3030,17 +3047,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in, i32 %old) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( -; GFX6-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -3054,7 +3071,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX7-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3072,7 +3089,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3086,7 +3103,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3100,7 +3117,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3114,7 +3131,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3126,7 +3143,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3140,1407 +3157,3652 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_wavefront_one_as_unordered_load( -; GFX6-LABEL: global_wavefront_one_as_unordered_load: +define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_unordered_load: +; GFX7-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_unordered_load: +; GFX10-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_unordered_load: +; GFX10-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_unordered_load: +; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_load: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_unordered_load: +; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") unordered, align 4 - store i32 %val, i32 addrspace(1)* %out + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( -; GFX6-LABEL: global_wavefront_one_as_monotonic_load: +define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_monotonic_load: +; GFX7-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_load: +; GFX10-WGP-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_unordered_load( +; GFX6-LABEL: global_wavefront_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( +; GFX6-LABEL: global_wavefront_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acquire_load( +; GFX6-LABEL: global_wavefront_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( +; GFX6-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_unordered_store( +; GFX6-LABEL: global_wavefront_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( +; GFX6-LABEL: global_wavefront_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_release_store( +; GFX6-LABEL: global_wavefront_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( +; GFX6-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( +; GFX6-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( +; GFX6-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acquire + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( +; GFX6-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") release + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_load: +; GFX10-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_load: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_load: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in) { entry: - %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") monotonic, align 4 - store i32 %val, i32 addrspace(1)* %out + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_wavefront_one_as_acquire_load( -; GFX6-LABEL: global_wavefront_one_as_acquire_load: +define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_acquire_load: +; GFX7-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_load: +; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_acquire_load: +; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_load: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_load: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_load: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in) { entry: - %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") acquire, align 4 - store i32 %val, i32 addrspace(1)* %out + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( -; GFX6-LABEL: global_wavefront_one_as_seq_cst_load: +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX7-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_load: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in) { entry: - %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") seq_cst, align 4 - store i32 %val, i32 addrspace(1)* %out + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_wavefront_one_as_unordered_store( -; GFX6-LABEL: global_wavefront_one_as_unordered_store: +define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_unordered_store: +; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_unordered_store: +; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_unordered_store: +; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_unordered_store: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_store: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_unordered_store: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") unordered, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic ret void } -define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( -; GFX6-LABEL: global_wavefront_one_as_monotonic_store: +define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_monotonic_store: +; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_store: +; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_store: +; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_store: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_store: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") monotonic, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic ret void } -define amdgpu_kernel void @global_wavefront_one_as_release_store( -; GFX6-LABEL: global_wavefront_one_as_release_store: +define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_release_store: +; GFX7-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_release_store: +; GFX10-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_release_store: +; GFX10-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_store: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_store: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_store: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") release, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic ret void } -define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( -; GFX6-LABEL: global_wavefront_one_as_seq_cst_store: +define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_store: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") seq_cst, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic ret void } -define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( -; GFX6-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") monotonic + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic ret void } -define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( -; GFX6-LABEL: global_wavefront_one_as_acquire_atomicrmw: +define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acquire + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire ret void } -define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( -; GFX6-LABEL: global_wavefront_one_as_release_atomicrmw: +define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") release + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire ret void } -define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( -; GFX6-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX7-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acq_rel + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire ret void } -define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( -; GFX6-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") seq_cst + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire ret void } -define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( -; GFX6-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acquire - store i32 %val, i32 addrspace(1)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire ret void } -define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( -; GFX6-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: +define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acq_rel - store i32 %val, i32 addrspace(1)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst ret void } -define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( -; GFX6-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: +define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst ret void } -define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( -; GFX6-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: +define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -4552,7 +6814,7 @@ ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4566,7 +6828,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4578,7 +6840,7 @@ ; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4590,7 +6852,7 @@ ; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4602,7 +6864,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4612,7 +6874,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4624,12 +6886,12 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst ret void } -define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( -; GFX6-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: +define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -4641,7 +6903,7 @@ ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4655,7 +6917,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4667,7 +6929,7 @@ ; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4679,7 +6941,7 @@ ; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4691,7 +6953,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4701,7 +6963,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4713,12 +6975,12 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst ret void } -define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( -; GFX6-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -4730,7 +6992,7 @@ ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: +; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4744,7 +7006,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4756,7 +7018,7 @@ ; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4768,7 +7030,7 @@ ; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4780,7 +7042,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4790,7 +7052,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4802,12 +7064,12 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst ret void } -define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( -; GFX6-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: +define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -4816,24 +7078,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4842,10 +7110,12 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4854,10 +7124,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4866,37 +7138,45 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( -; GFX6-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: +define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -4905,24 +7185,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4931,10 +7217,12 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4943,10 +7231,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4955,37 +7245,45 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( -; GFX6-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: +define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -4994,24 +7292,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX7-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5020,10 +7324,12 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5032,10 +7338,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5044,37 +7352,45 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( -; GFX6-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: +define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5083,24 +7399,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: +; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5109,10 +7431,12 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5121,10 +7445,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5133,37 +7459,45 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( -; GFX6-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5172,24 +7506,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5198,10 +7538,12 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5210,10 +7552,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5222,37 +7566,45 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( -; GFX6-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: +define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5261,24 +7613,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5287,10 +7645,12 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5299,10 +7659,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5311,37 +7673,45 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( -; GFX6-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5350,24 +7720,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5376,10 +7752,12 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5388,10 +7766,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5400,37 +7780,45 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5444,7 +7832,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5462,7 +7850,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5476,7 +7864,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5490,7 +7878,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5504,7 +7892,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5516,7 +7904,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5530,14 +7918,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5551,7 +7939,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5569,7 +7957,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5583,7 +7971,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5597,7 +7985,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5611,7 +7999,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5623,7 +8011,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5637,14 +8025,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5658,7 +8046,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5676,7 +8064,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5690,7 +8078,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5704,7 +8092,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5718,7 +8106,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5730,7 +8118,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5744,14 +8132,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( -; GFX6-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5765,7 +8153,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5783,7 +8171,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5797,7 +8185,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5811,7 +8199,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5825,7 +8213,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5837,7 +8225,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5851,14 +8239,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( -; GFX6-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5872,7 +8260,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5890,7 +8278,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5904,7 +8292,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5918,7 +8306,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5932,7 +8320,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5944,7 +8332,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5958,14 +8346,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( -; GFX6-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5979,7 +8367,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5997,7 +8385,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6011,7 +8399,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6025,7 +8413,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6039,7 +8427,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6051,7 +8439,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6065,14 +8453,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( -; GFX6-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6086,7 +8474,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6104,7 +8492,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6118,7 +8506,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6132,7 +8520,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6146,7 +8534,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6158,7 +8546,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6172,7 +8560,7 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -1963,6 +1963,99 @@ ret void } +define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( +; GFX6-LABEL: global_workgroup_monotonic_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire + ret void +} + define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry @@ -2359,8 +2452,8 @@ ret void } -define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( -; GFX6-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2373,7 +2466,7 @@ ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX7-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2388,7 +2481,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2404,7 +2497,7 @@ ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2417,7 +2510,7 @@ ; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2430,7 +2523,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2441,7 +2534,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2456,12 +2549,12 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic seq_cst ret void } -define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2470,30 +2563,26 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2502,13 +2591,14 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2517,12 +2607,11 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2531,46 +2620,42 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(1)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire seq_cst ret void } -define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( +; GFX6-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2580,31 +2665,25 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2615,13 +2694,12 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2631,12 +2709,10 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2646,12 +2722,10 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2659,12 +2733,10 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2672,22 +2744,19 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(1)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") release seq_cst ret void } -define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2697,31 +2766,25 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s4, s0, 16 -; GFX7-NEXT: s_addc_u32 s5, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2732,13 +2795,12 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2748,12 +2810,10 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2763,12 +2823,10 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2776,12 +2834,10 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2789,22 +2845,19 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(1)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel seq_cst ret void } -define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( -; GFX6-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2813,12 +2866,113 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_endpgm +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2836,7 +2990,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2847,11 +3001,10 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2865,7 +3018,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2879,7 +3032,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2891,7 +3044,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -2900,20 +3053,19 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( -; GFX6-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -2922,13 +3074,12 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX7-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -2939,7 +3090,6 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 @@ -2947,7 +3097,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2956,15 +3106,13 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2973,13 +3121,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -2988,33 +3135,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -3023,14 +3167,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( -; GFX6-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -3045,7 +3189,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX7-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3064,7 +3208,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3077,11 +3221,10 @@ ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3096,7 +3239,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3111,7 +3254,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3124,7 +3267,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3134,20 +3277,19 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( -; GFX6-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -3162,7 +3304,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX7-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3181,7 +3323,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3198,7 +3340,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3213,7 +3355,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3228,7 +3370,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3241,7 +3383,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3257,14 +3399,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( -; GFX6-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -3279,7 +3421,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX7-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -3298,7 +3440,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3315,7 +3457,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -3330,7 +3472,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -3345,7 +3487,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3358,7 +3500,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -3374,1451 +3516,3737 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_workgroup_one_as_unordered_load( -; GFX6-LABEL: global_workgroup_one_as_unordered_load: +define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_unordered_load: +; GFX7-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_unordered_load: +; GFX10-WGP-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_unordered_load: +; GFX10-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_unordered_load: +; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_load: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_unordered_load( +; GFX6-LABEL: global_workgroup_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( +; GFX6-LABEL: global_workgroup_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acquire_load( +; GFX6-LABEL: global_workgroup_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( +; GFX6-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 +; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_unordered_store( +; GFX6-LABEL: global_workgroup_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( +; GFX6-LABEL: global_workgroup_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_release_store( +; GFX6-LABEL: global_workgroup_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( +; GFX6-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( +; GFX6-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( +; GFX6-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( +; GFX6-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") release + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_unordered_load: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in) { entry: - %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") unordered, align 4 - store i32 %val, i32 addrspace(1)* %out + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel ret void } -define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( -; GFX6-LABEL: global_workgroup_one_as_monotonic_load: +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_monotonic_load: +; GFX7-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_load: +; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_load: +; GFX10-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_load: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_load: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in) { entry: - %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") monotonic, align 4 - store i32 %val, i32 addrspace(1)* %out + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_workgroup_one_as_acquire_load( -; GFX6-LABEL: global_workgroup_one_as_acquire_load: +define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_acquire_load: +; GFX7-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_load: +; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_acquire_load: +; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_load: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_load: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_load: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in) { entry: - %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") acquire, align 4 - store i32 %val, i32 addrspace(1)* %out + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( -; GFX6-LABEL: global_workgroup_one_as_seq_cst_load: +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s0, s4 -; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_mov_b32 s4, s6 -; GFX6-NEXT: s_mov_b32 s5, s7 -; GFX6-NEXT: s_mov_b32 s6, s2 -; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX7-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_load_dword v0, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_load: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 -; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in) { entry: - %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") seq_cst, align 4 - store i32 %val, i32 addrspace(1)* %out + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_workgroup_one_as_unordered_store( -; GFX6-LABEL: global_workgroup_one_as_unordered_store: +define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_unordered_store: +; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_unordered_store: +; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_unordered_store: +; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_unordered_store: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_store: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_unordered_store: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") unordered, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic ret void } -define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( -; GFX6-LABEL: global_workgroup_one_as_monotonic_store: +define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_monotonic_store: +; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_store: +; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_store: +; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_store: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_store: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") monotonic, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic ret void } -define amdgpu_kernel void @global_workgroup_one_as_release_store( -; GFX6-LABEL: global_workgroup_one_as_release_store: +define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_release_store: +; GFX7-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_release_store: +; GFX10-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_release_store: +; GFX10-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_store: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_store: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_store: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") release, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic ret void } -define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( -; GFX6-LABEL: global_workgroup_one_as_seq_cst_store: +define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_store: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(1)* %out) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") seq_cst, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic ret void } -define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( -; GFX6-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") monotonic + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic ret void } -define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( -; GFX6-LABEL: global_workgroup_one_as_acquire_atomicrmw: +define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire ret void } -define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( -; GFX6-LABEL: global_workgroup_one_as_release_atomicrmw: +define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") release + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire ret void } -define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( -; GFX6-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire ret void } -define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( -; GFX6-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire ret void } -define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( -; GFX6-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire - store i32 %val, i32 addrspace(1)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire ret void } -define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( -; GFX6-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: +define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel - store i32 %val, i32 addrspace(1)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst ret void } -define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( -; GFX6-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: +define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s4, s[4:5], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 ; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(1)* %out, i32 %in) { + i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst ret void } -define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( -; GFX6-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: +define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -4830,7 +7258,7 @@ ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4844,7 +7272,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4853,10 +7281,14 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4868,7 +7300,7 @@ ; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4880,7 +7312,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4890,24 +7322,27 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst ret void } -define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( -; GFX6-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: +define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -4919,7 +7354,7 @@ ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -4933,7 +7368,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4942,12 +7377,14 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -4959,7 +7396,7 @@ ; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -4971,7 +7408,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -4981,13 +7418,14 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -4995,12 +7433,12 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst ret void } -define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( -; GFX6-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5012,7 +7450,7 @@ ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: +; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5026,7 +7464,7 @@ ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5038,9 +7476,11 @@ ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5052,7 +7492,7 @@ ; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5064,7 +7504,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5074,7 +7514,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5083,16 +7523,18 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst ret void } -define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( -; GFX6-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: +define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5101,24 +7543,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5127,14 +7575,12 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5143,10 +7589,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5155,40 +7603,45 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( -; GFX6-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: +define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5197,24 +7650,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5223,14 +7682,13 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5239,10 +7697,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5251,40 +7711,46 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( -; GFX6-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: +define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5293,24 +7759,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX7-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5319,12 +7791,14 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5333,10 +7807,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5345,39 +7821,46 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( -; GFX6-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: +define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5386,24 +7869,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: +; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5414,12 +7903,13 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5428,10 +7918,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5440,20 +7932,24 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5461,19 +7957,22 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( -; GFX6-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5482,24 +7981,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5510,12 +8015,13 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5524,10 +8030,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5536,20 +8044,24 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5557,19 +8069,22 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( -; GFX6-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: +define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5578,24 +8093,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5604,14 +8125,13 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5620,10 +8140,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5632,40 +8154,46 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( -; GFX6-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5674,24 +8202,30 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_u32 s0, s0, 16 -; GFX7-NEXT: s_addc_u32 s1, s1, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5700,14 +8234,13 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5716,10 +8249,12 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5728,40 +8263,46 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5775,7 +8316,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5793,7 +8334,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5802,13 +8343,15 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5822,7 +8365,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5836,7 +8379,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5848,13 +8391,14 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -5863,14 +8407,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5884,7 +8428,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -5902,7 +8446,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5919,7 +8463,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -5933,7 +8477,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -5947,7 +8491,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5959,7 +8503,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -5975,14 +8519,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( -; GFX6-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -5996,7 +8540,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6014,7 +8558,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6031,7 +8575,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6045,7 +8589,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6059,7 +8603,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6071,7 +8615,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6087,14 +8631,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( -; GFX6-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6108,7 +8652,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6126,7 +8670,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6135,13 +8679,15 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6155,7 +8701,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6169,7 +8715,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6181,13 +8727,14 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -6196,14 +8743,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( -; GFX6-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6217,7 +8764,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6235,7 +8782,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6252,7 +8799,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6266,7 +8813,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6280,7 +8827,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6292,7 +8839,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6308,14 +8855,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( -; GFX6-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6329,7 +8876,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6347,7 +8894,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6364,7 +8911,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6378,7 +8925,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6392,7 +8939,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6404,7 +8951,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6420,14 +8967,14 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void } -define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( -; GFX6-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -6441,7 +8988,7 @@ ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 @@ -6459,7 +9006,7 @@ ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6476,7 +9023,7 @@ ; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -6490,7 +9037,7 @@ ; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb @@ -6504,7 +9051,7 @@ ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6516,7 +9063,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 @@ -6532,7 +9079,7 @@ i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -1836,6 +1836,96 @@ ret void } +define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( +; GFX6-LABEL: local_agent_monotonic_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") monotonic acquire + ret void +} + define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_agent_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry @@ -2220,8 +2310,8 @@ ret void } -define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( -; GFX6-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2235,7 +2325,7 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: +; GFX7-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2248,7 +2338,7 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2262,7 +2352,7 @@ ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2274,7 +2364,7 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2288,7 +2378,7 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2300,7 +2390,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2314,12 +2404,12 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") monotonic seq_cst ret void } -define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2328,12 +2418,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2341,37 +2431,38 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2380,46 +2471,43 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(3)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acquire seq_cst ret void } -define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( +; GFX6-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2429,12 +2517,11 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2443,12 +2530,11 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2457,13 +2543,12 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2471,12 +2556,11 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2486,12 +2570,11 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2499,12 +2582,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2512,22 +2594,18 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(3)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") release seq_cst ret void } -define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2537,12 +2615,11 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2551,12 +2628,11 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2565,13 +2641,12 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2579,12 +2654,11 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2594,12 +2668,11 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2607,12 +2680,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2620,22 +2692,18 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(3)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel seq_cst ret void } -define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( -; GFX6-LABEL: local_agent_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2644,12 +2712,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_acquire_acquire_ret_cmpxchg: +; GFX7-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2657,37 +2725,38 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2696,46 +2765,43 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(3)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst ret void } -define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( -; GFX6-LABEL: local_agent_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2744,13 +2810,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_release_acquire_ret_cmpxchg: +; GFX7-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2758,41 +2823,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2801,49 +2861,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") release acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( -; GFX6-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2852,13 +2908,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: +; GFX7-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2866,41 +2921,37 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2909,33 +2960,30 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2944,14 +2992,14 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( -; GFX6-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2966,7 +3014,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: +; GFX7-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2980,7 +3028,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2991,11 +3039,10 @@ ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3008,7 +3055,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -3023,7 +3070,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3036,7 +3083,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3045,21 +3092,20 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( -; GFX6-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -3074,7 +3120,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX7-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -3088,7 +3134,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3103,7 +3149,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3116,7 +3162,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -3131,7 +3177,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3144,7 +3190,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3160,1287 +3206,3443 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_unordered_load( -; GFX6-LABEL: local_agent_one_as_unordered_load: +define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_unordered_load: +; GFX7-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_unordered_load: +; GFX10-WGP-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_unordered_load: +; GFX10-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_unordered_load: +; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_load: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_unordered_load: +; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") unordered, align 4 - store i32 %val, i32 addrspace(3)* %out + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_monotonic_load( -; GFX6-LABEL: local_agent_one_as_monotonic_load: +define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_monotonic_load: +; GFX7-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_monotonic_load: +; GFX10-WGP-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_monotonic_load: +; GFX10-CU-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_load: +; SKIP-CACHE-INV-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_load: +; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") monotonic, align 4 - store i32 %val, i32 addrspace(3)* %out + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_acquire_load( -; GFX6-LABEL: local_agent_one_as_acquire_load: +define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_acquire_load: +; GFX7-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_acquire_load: +; GFX10-WGP-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_acquire_load: +; GFX10-CU-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_load: +; SKIP-CACHE-INV-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_load: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_load: +; GFX90A-TGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") acquire, align 4 - store i32 %val, i32 addrspace(3)* %out + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_seq_cst_load( -; GFX6-LABEL: local_agent_one_as_seq_cst_load: +define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_seq_cst_load: +; GFX7-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_load: +; GFX10-WGP-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_seq_cst_load: +; GFX10-CU-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_load: +; SKIP-CACHE-INV-LABEL: local_agent_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_load: +; GFX90A-TGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") seq_cst, align 4 - store i32 %val, i32 addrspace(3)* %out + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_unordered_store( -; GFX6-LABEL: local_agent_one_as_unordered_store: +define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_unordered_store: +; GFX7-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_unordered_store: +; GFX10-WGP-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_unordered_store: +; GFX10-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_unordered_store: +; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_store: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_unordered_store: +; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") unordered, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_monotonic_store( -; GFX6-LABEL: local_agent_one_as_monotonic_store: +define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_monotonic_store: +; GFX7-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_monotonic_store: +; GFX10-WGP-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_monotonic_store: +; GFX10-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_store: +; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_store: +; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") monotonic, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_release_store( -; GFX6-LABEL: local_agent_one_as_release_store: +define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_release_store: +; GFX7-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_release_store: +; GFX10-WGP-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_release_store: +; GFX10-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_store: +; SKIP-CACHE-INV-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_store: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_store: +; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") release, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_seq_cst_store( -; GFX6-LABEL: local_agent_one_as_seq_cst_store: +define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_seq_cst_store: +; GFX7-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_store: +; GFX10-WGP-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 -; GFX10-WGP-NEXT: s_endpgm -; -; GFX10-CU-LABEL: local_agent_one_as_seq_cst_store: +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_store: +; SKIP-CACHE-INV-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_store: +; GFX90A-TGSPLIT-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") seq_cst, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( -; GFX6-LABEL: local_agent_one_as_monotonic_atomicrmw: +define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX7-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX10-WGP-LABEL: local_agent_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_release_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_release_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_unordered_load( +; GFX6-LABEL: local_agent_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_one_as_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_agent_one_as_monotonic_load( +; GFX6-LABEL: local_agent_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acquire_load( +; GFX6-LABEL: local_agent_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_load( +; GFX6-LABEL: local_agent_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_agent_one_as_unordered_store( +; GFX6-LABEL: local_agent_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_one_as_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_monotonic_store( +; GFX6-LABEL: local_agent_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_release_store( +; GFX6-LABEL: local_agent_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_store( +; GFX6-LABEL: local_agent_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( +; GFX6-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") monotonic + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( +; GFX6-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acquire + ret void +} + +define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( +; GFX6-LABEL: local_agent_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") release + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX10-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") monotonic + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic ret void } -define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( -; GFX6-LABEL: local_agent_one_as_acquire_atomicrmw: +define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( +; GFX6-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX7-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX10-WGP-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX10-CU-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acquire + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic acquire ret void } -define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( -; GFX6-LABEL: local_agent_one_as_release_atomicrmw: +define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_release_atomicrmw: +; GFX7-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_release_atomicrmw: +; GFX10-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_release_atomicrmw: +; GFX10-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") release + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire ret void } -define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( -; GFX6-LABEL: local_agent_one_as_acq_rel_atomicrmw: +define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX7-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX10-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX10-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acq_rel + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire ret void } -define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( -; GFX6-LABEL: local_agent_one_as_seq_cst_atomicrmw: +define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX7-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX10-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") seq_cst + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire ret void } -define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( -; GFX6-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX7-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX10-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acquire - store i32 %val, i32 addrspace(3)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire ret void } -define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( -; GFX6-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX7-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acq_rel - store i32 %val, i32 addrspace(3)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic seq_cst ret void } -define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( -; GFX6-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX7-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") seq_cst - store i32 %val, i32 addrspace(3)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire seq_cst ret void } -define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( -; GFX6-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( +; GFX6-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4452,7 +6654,7 @@ ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX7-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4463,7 +6665,7 @@ ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4473,7 +6675,7 @@ ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4483,7 +6685,7 @@ ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4495,7 +6697,7 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4505,7 +6707,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4517,12 +6719,12 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release seq_cst ret void } -define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( -; GFX6-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4534,7 +6736,7 @@ ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX7-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4545,7 +6747,7 @@ ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4555,7 +6757,7 @@ ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4565,7 +6767,7 @@ ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4577,7 +6779,7 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4587,7 +6789,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4599,12 +6801,12 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel seq_cst ret void } -define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( -; GFX6-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4616,7 +6818,7 @@ ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX7-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4627,7 +6829,7 @@ ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4637,7 +6839,7 @@ ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4647,7 +6849,7 @@ ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4659,7 +6861,7 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4669,7 +6871,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4681,12 +6883,12 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst ret void } -define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( -; GFX6-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4695,10 +6897,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX7-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4706,30 +6910,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4738,37 +6948,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( -; GFX6-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4777,10 +6995,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX7-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4788,30 +7008,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4820,37 +7046,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( -; GFX6-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: +define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4859,10 +7093,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: +; GFX7-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4870,30 +7106,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: +; GFX10-CU-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4902,37 +7144,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( -; GFX6-LABEL: local_agent_one_as_release_acquire_cmpxchg: +define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4941,10 +7191,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_release_acquire_cmpxchg: +; GFX7-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4952,30 +7204,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg: +; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg: +; GFX10-CU-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4984,37 +7242,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( -; GFX6-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: +define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5023,10 +7289,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX7-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5034,30 +7302,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5066,37 +7340,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( -; GFX6-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: +define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5105,10 +7387,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX7-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5116,30 +7400,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5148,37 +7438,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( -; GFX6-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5187,10 +7485,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5198,30 +7498,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5230,37 +7536,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5274,7 +7588,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5287,7 +7601,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5299,7 +7613,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5311,7 +7625,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5325,7 +7639,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5337,7 +7651,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5351,14 +7665,14 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5372,7 +7686,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5385,7 +7699,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5397,7 +7711,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5409,7 +7723,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5423,7 +7737,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5435,7 +7749,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5449,14 +7763,14 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5470,7 +7784,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5483,7 +7797,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5495,7 +7809,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5507,7 +7821,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5521,7 +7835,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5533,7 +7847,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5547,14 +7861,14 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( -; GFX6-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5568,7 +7882,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX7-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5581,7 +7895,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5593,7 +7907,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5605,7 +7919,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5619,7 +7933,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5631,7 +7945,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5645,14 +7959,14 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( -; GFX6-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5666,7 +7980,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: +; GFX7-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5679,7 +7993,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5691,7 +8005,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5703,7 +8017,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5717,7 +8031,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5729,7 +8043,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5743,14 +8057,14 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( -; GFX6-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5764,7 +8078,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5777,7 +8091,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5789,7 +8103,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5801,7 +8115,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5815,7 +8129,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5827,7 +8141,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5841,14 +8155,14 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( -; GFX6-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5862,7 +8176,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5875,7 +8189,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5887,7 +8201,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5899,7 +8213,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5913,7 +8227,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5925,7 +8239,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5939,7 +8253,7 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll @@ -88,8 +88,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") unordered, align 4 @@ -178,8 +176,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") monotonic, align 4 @@ -268,8 +264,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") acquire, align 4 @@ -358,8 +352,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") seq_cst, align 4 @@ -434,8 +426,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") unordered, align 4 @@ -509,8 +499,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") monotonic, align 4 @@ -584,8 +572,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") release, align 4 @@ -659,8 +645,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") seq_cst, align 4 @@ -734,8 +718,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") monotonic @@ -809,8 +791,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acquire @@ -884,8 +864,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") release @@ -959,8 +937,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acq_rel @@ -1034,8 +1010,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") seq_cst @@ -1123,8 +1097,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acquire @@ -1213,8 +1185,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acq_rel @@ -1303,8 +1273,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") seq_cst @@ -1387,8 +1355,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1471,8 +1437,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1555,8 +1519,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1639,8 +1601,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1723,12 +1683,92 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( +; GFX6-LABEL: local_singlethread_monotonic_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm ; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; +; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire ret void } @@ -1807,8 +1847,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1891,8 +1929,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1975,8 +2011,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2059,8 +2093,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2068,8 +2100,8 @@ ret void } -define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( -; GFX6-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2081,7 +2113,7 @@ ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX7-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2092,7 +2124,7 @@ ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2102,7 +2134,7 @@ ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2112,7 +2144,7 @@ ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2124,7 +2156,7 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2134,7 +2166,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2143,17 +2175,15 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst ret void } -define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2162,12 +2192,10 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2175,36 +2203,30 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2213,47 +2235,37 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(3)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst ret void } -define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( +; GFX6-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2262,12 +2274,10 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2275,36 +2285,30 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2313,47 +2317,37 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(3)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst ret void } -define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2362,12 +2356,10 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2375,36 +2367,30 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2413,47 +2399,37 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(3)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst ret void } -define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( -; GFX6-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2462,12 +2438,10 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: +; GFX7-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2475,36 +2449,30 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2513,47 +2481,37 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(3)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst ret void } -define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( -; GFX6-LABEL: local_singlethread_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2567,7 +2525,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_release_acquire_ret_cmpxchg: +; GFX7-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2580,7 +2538,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2592,7 +2550,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2604,7 +2562,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2618,7 +2576,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2630,7 +2588,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2641,19 +2599,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( -; GFX6-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2667,7 +2623,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX7-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2680,7 +2636,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2692,7 +2648,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2704,7 +2660,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2718,7 +2674,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2730,7 +2686,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2741,19 +2697,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( -; GFX6-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2767,7 +2721,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX7-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2780,7 +2734,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2792,7 +2746,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2804,7 +2758,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2818,7 +2772,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2830,7 +2784,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2841,19 +2795,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( -; GFX6-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2867,7 +2819,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX7-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2880,7 +2832,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2892,7 +2844,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2904,7 +2856,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2918,7 +2870,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2930,7 +2882,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2941,1324 +2893,3352 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_unordered_load( -; GFX6-LABEL: local_singlethread_one_as_unordered_load: +define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_unordered_load: +; GFX7-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_unordered_load: +; GFX10-WGP-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_unordered_load: +; GFX10-CU-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_unordered_load: +; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_load: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_unordered_load: +; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") unordered, align 4 - store i32 %val, i32 addrspace(3)* %out + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( -; GFX6-LABEL: local_singlethread_one_as_monotonic_load: +define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_monotonic_load: +; GFX7-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_load: +; GFX10-WGP-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_load: +; GFX10-CU-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_load: +; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_load: +; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") monotonic, align 4 - store i32 %val, i32 addrspace(3)* %out + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_acquire_load( -; GFX6-LABEL: local_singlethread_one_as_acquire_load: +define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_acquire_load: +; GFX7-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_load: +; GFX10-WGP-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_acquire_load: +; GFX10-CU-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_load: +; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_load: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_load: +; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") acquire, align 4 - store i32 %val, i32 addrspace(3)* %out + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( -; GFX6-LABEL: local_singlethread_one_as_seq_cst_load: +define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX7-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX10-WGP-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX10-CU-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_load: +; SKIP-CACHE-INV-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX90A-TGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") seq_cst, align 4 - store i32 %val, i32 addrspace(3)* %out + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_unordered_store( -; GFX6-LABEL: local_singlethread_one_as_unordered_store: +define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_unordered_store: +; GFX7-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_unordered_store: +; GFX10-WGP-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_unordered_store: +; GFX10-CU-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_unordered_store: +; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_store: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_unordered_store: +; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") unordered, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( -; GFX6-LABEL: local_singlethread_one_as_monotonic_store: +define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_monotonic_store: +; GFX7-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_store: +; GFX10-WGP-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_store: +; GFX10-CU-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_store: +; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_store: +; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") monotonic, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_release_store( -; GFX6-LABEL: local_singlethread_one_as_release_store: +define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_release_store: +; GFX7-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_release_store: +; GFX10-WGP-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_release_store: +; GFX10-CU-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_store: +; SKIP-CACHE-INV-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_store: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_store: +; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") release, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( -; GFX6-LABEL: local_singlethread_one_as_seq_cst_store: +define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX7-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX10-WGP-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 -; GFX10-WGP-NEXT: s_endpgm +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX10-CU-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_store: +; SKIP-CACHE-INV-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") seq_cst, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( -; GFX6-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX7-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX10-WGP-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX10-CU-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm ; +; GFX10-WGP-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm ; - i32 addrspace(3)* %out, i32 %in) { +; GFX10-CU-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") monotonic + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( -; GFX6-LABEL: local_singlethread_one_as_acquire_atomicrmw: +define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_unordered_load( +; GFX6-LABEL: local_singlethread_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( +; GFX6-LABEL: local_singlethread_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acquire_load( +; GFX6-LABEL: local_singlethread_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( +; GFX6-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_unordered_store( +; GFX6-LABEL: local_singlethread_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( +; GFX6-LABEL: local_singlethread_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_release_store( +; GFX6-LABEL: local_singlethread_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( +; GFX6-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( +; GFX6-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( +; GFX6-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acquire + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( +; GFX6-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") release + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX7-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acquire + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire ret void } -define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( -; GFX6-LABEL: local_singlethread_one_as_release_atomicrmw: +define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX7-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX10-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") release + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire ret void } -define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( -; GFX6-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX7-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX10-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX10-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acq_rel + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire ret void } -define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( -; GFX6-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX7-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") seq_cst + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire ret void } -define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( -; GFX6-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX7-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acquire - store i32 %val, i32 addrspace(3)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire ret void } -define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( -; GFX6-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX7-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acq_rel - store i32 %val, i32 addrspace(3)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst ret void } -define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( -; GFX6-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX7-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") seq_cst - store i32 %val, i32 addrspace(3)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst ret void } -define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg( -; GFX6-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4270,7 +6250,7 @@ ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX7-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4281,7 +6261,7 @@ ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4291,7 +6271,7 @@ ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4301,7 +6281,7 @@ ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4313,7 +6293,7 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4323,7 +6303,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4332,17 +6312,15 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst ret void } -define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( -; GFX6-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4354,7 +6332,7 @@ ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX7-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4365,7 +6343,7 @@ ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4375,7 +6353,7 @@ ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4385,7 +6363,7 @@ ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4397,7 +6375,7 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4407,7 +6385,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4416,17 +6394,15 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst ret void } -define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( -; GFX6-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4438,7 +6414,7 @@ ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX7-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4449,7 +6425,7 @@ ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4459,7 +6435,7 @@ ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4469,7 +6445,7 @@ ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4481,7 +6457,7 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4491,7 +6467,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4500,17 +6476,15 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst ret void } -define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( -; GFX6-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4519,10 +6493,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX7-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4530,30 +6506,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4562,39 +6544,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( -; GFX6-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4603,10 +6591,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX7-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4614,30 +6604,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4646,39 +6642,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( -; GFX6-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: +define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4687,10 +6689,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX7-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4698,30 +6702,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4730,39 +6740,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( -; GFX6-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: +define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4771,10 +6787,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: +; GFX7-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4782,30 +6800,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4814,39 +6838,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( -; GFX6-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4855,10 +6885,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX7-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4866,30 +6898,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4898,39 +6936,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( -; GFX6-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: +define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4939,10 +6983,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX7-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4950,30 +6996,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4982,39 +7034,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( -; GFX6-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5023,10 +7081,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5034,30 +7094,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5066,39 +7132,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5112,7 +7184,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5125,7 +7197,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5137,7 +7209,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5149,7 +7221,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5163,7 +7235,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5175,7 +7247,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5186,19 +7258,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5212,7 +7282,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5225,7 +7295,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5237,7 +7307,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5249,7 +7319,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5263,7 +7333,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5275,7 +7345,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5286,19 +7356,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5312,7 +7380,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5325,7 +7393,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5337,7 +7405,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5349,7 +7417,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5363,7 +7431,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5375,7 +7443,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5386,19 +7454,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg( -; GFX6-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5412,7 +7478,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX7-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5425,7 +7491,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5437,7 +7503,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5449,7 +7515,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5463,7 +7529,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5475,7 +7541,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5486,19 +7552,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg( -; GFX6-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5512,7 +7576,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX7-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5525,7 +7589,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5537,7 +7601,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5549,7 +7613,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5563,7 +7627,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5575,7 +7639,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5586,19 +7650,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( -; GFX6-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5612,7 +7674,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5625,7 +7687,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5637,7 +7699,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5649,7 +7711,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5663,7 +7725,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5675,7 +7737,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5686,19 +7748,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( -; GFX6-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5712,7 +7772,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5725,7 +7785,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5737,7 +7797,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5749,7 +7809,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5763,7 +7823,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5775,7 +7835,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5786,12 +7846,10 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void @@ -5886,8 +7944,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -1836,6 +1836,96 @@ ret void } +define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( +; GFX6-LABEL: local_system_monotonic_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in monotonic acquire + ret void +} + define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_system_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry @@ -2220,8 +2310,8 @@ ret void } -define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( -; GFX6-LABEL: local_system_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2235,7 +2325,7 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_seq_cst_seq_cst_cmpxchg: +; GFX7-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2248,7 +2338,7 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2262,7 +2352,7 @@ ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2274,7 +2364,7 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2288,7 +2378,7 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2300,7 +2390,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2314,12 +2404,12 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in monotonic seq_cst ret void } -define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_system_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2328,12 +2418,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_acquire_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2341,37 +2431,38 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2380,46 +2471,43 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acquire monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(3)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acquire seq_cst ret void } -define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( +; GFX6-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2429,12 +2517,11 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2443,12 +2530,11 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2457,13 +2543,12 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2471,12 +2556,11 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2486,12 +2570,11 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2499,12 +2582,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2512,22 +2594,18 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acq_rel monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(3)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in release seq_cst ret void } -define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2537,12 +2615,11 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2551,12 +2628,11 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2565,13 +2641,12 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2579,12 +2654,11 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2594,12 +2668,11 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2607,12 +2680,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2620,22 +2692,18 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(3)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acq_rel seq_cst ret void } -define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( -; GFX6-LABEL: local_system_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2644,12 +2712,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_acquire_acquire_ret_cmpxchg: +; GFX7-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2657,37 +2725,38 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2696,46 +2765,43 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acquire acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(3)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst seq_cst ret void } -define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( -; GFX6-LABEL: local_system_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2744,13 +2810,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_release_acquire_ret_cmpxchg: +; GFX7-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2758,41 +2823,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2801,49 +2861,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in release acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( -; GFX6-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2852,13 +2908,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: +; GFX7-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2866,41 +2921,37 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2909,33 +2960,30 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2944,14 +2992,14 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acq_rel acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( -; GFX6-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2966,7 +3014,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: +; GFX7-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2980,7 +3028,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2991,11 +3039,10 @@ ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3008,7 +3055,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -3023,7 +3070,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3036,7 +3083,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3045,21 +3092,20 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( -; GFX6-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -3074,7 +3120,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX7-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -3088,7 +3134,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3103,7 +3149,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3116,7 +3162,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -3131,7 +3177,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3144,7 +3190,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3160,1287 +3206,3443 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_unordered_load( -; GFX6-LABEL: local_system_one_as_unordered_load: +define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_unordered_load: +; GFX7-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_unordered_load: +; GFX10-WGP-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_unordered_load: +; GFX10-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_unordered_load: +; SKIP-CACHE-INV-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_unordered_load: +; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_unordered_load: +; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") unordered, align 4 - store i32 %val, i32 addrspace(3)* %out + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_monotonic_load( -; GFX6-LABEL: local_system_one_as_monotonic_load: +define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_monotonic_load: +; GFX7-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_monotonic_load: +; GFX10-WGP-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_monotonic_load: +; GFX10-CU-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_load: +; SKIP-CACHE-INV-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_load: +; GFX90A-TGSPLIT-LABEL: local_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") monotonic, align 4 - store i32 %val, i32 addrspace(3)* %out + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_acquire_load( -; GFX6-LABEL: local_system_one_as_acquire_load: +define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_acquire_load: +; GFX7-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_acquire_load: +; GFX10-WGP-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_acquire_load: +; GFX10-CU-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_load: +; SKIP-CACHE-INV-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_load: +; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_load: +; GFX90A-TGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") acquire, align 4 - store i32 %val, i32 addrspace(3)* %out + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_seq_cst_load( -; GFX6-LABEL: local_system_one_as_seq_cst_load: +define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_seq_cst_load: +; GFX7-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_seq_cst_load: +; GFX10-WGP-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_seq_cst_load: +; GFX10-CU-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_load: +; SKIP-CACHE-INV-LABEL: local_system_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_load: +; GFX90A-TGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") seq_cst, align 4 - store i32 %val, i32 addrspace(3)* %out + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_unordered_store( -; GFX6-LABEL: local_system_one_as_unordered_store: +define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_unordered_store: +; GFX7-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_unordered_store: +; GFX10-WGP-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_unordered_store: +; GFX10-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_unordered_store: +; SKIP-CACHE-INV-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_unordered_store: +; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_unordered_store: +; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") unordered, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_monotonic_store( -; GFX6-LABEL: local_system_one_as_monotonic_store: +define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_monotonic_store: +; GFX7-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_monotonic_store: +; GFX10-WGP-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_monotonic_store: +; GFX10-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_store: +; SKIP-CACHE-INV-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_store: +; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") monotonic, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_release_store( -; GFX6-LABEL: local_system_one_as_release_store: +define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_release_store: +; GFX7-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_release_store: +; GFX10-WGP-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_release_store: +; GFX10-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_release_store: +; SKIP-CACHE-INV-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_store: +; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_store: +; GFX90A-TGSPLIT-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") release, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_seq_cst_store( -; GFX6-LABEL: local_system_one_as_seq_cst_store: +define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_seq_cst_store: +; GFX7-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_seq_cst_store: +; GFX10-WGP-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 -; GFX10-WGP-NEXT: s_endpgm -; -; GFX10-CU-LABEL: local_system_one_as_seq_cst_store: +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_store: +; SKIP-CACHE-INV-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_store: +; GFX90A-TGSPLIT-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") seq_cst, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( -; GFX6-LABEL: local_system_one_as_monotonic_atomicrmw: +define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX7-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX10-WGP-LABEL: local_system_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_release_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_release_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_release_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_unordered_load( +; GFX6-LABEL: local_system_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_one_as_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_system_one_as_monotonic_load( +; GFX6-LABEL: local_system_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_system_one_as_acquire_load( +; GFX6-LABEL: local_system_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_load( +; GFX6-LABEL: local_system_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_system_one_as_unordered_store( +; GFX6-LABEL: local_system_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_one_as_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_monotonic_store( +; GFX6-LABEL: local_system_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_release_store( +; GFX6-LABEL: local_system_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_one_as_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_store( +; GFX6-LABEL: local_system_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( +; GFX6-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( +; GFX6-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acquire + ret void +} + +define amdgpu_kernel void @local_system_one_as_release_atomicrmw( +; GFX6-LABEL: local_system_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_one_as_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") release + ret void +} + +define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acq_rel + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") seq_cst + ret void +} + +define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic + ret void +} + +define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX10-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") monotonic + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic ret void } -define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( -; GFX6-LABEL: local_system_one_as_acquire_atomicrmw: +define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( +; GFX6-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX7-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX10-WGP-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX10-CU-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acquire + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire ret void } -define amdgpu_kernel void @local_system_one_as_release_atomicrmw( -; GFX6-LABEL: local_system_one_as_release_atomicrmw: +define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_release_atomicrmw: +; GFX7-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_release_atomicrmw: +; GFX10-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_release_atomicrmw: +; GFX10-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_release_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") release + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire ret void } -define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( -; GFX6-LABEL: local_system_one_as_acq_rel_atomicrmw: +define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX7-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX10-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX10-CU-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_system_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acq_rel + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire ret void } -define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( -; GFX6-LABEL: local_system_one_as_seq_cst_atomicrmw: +define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX7-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX10-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX10-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") seq_cst + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire ret void } -define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( -; GFX6-LABEL: local_system_one_as_acquire_ret_atomicrmw: +define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX7-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX10-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX10-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acquire - store i32 %val, i32 addrspace(3)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire ret void } -define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( -; GFX6-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX7-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acq_rel - store i32 %val, i32 addrspace(3)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst ret void } -define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( -; GFX6-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX7-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") seq_cst - store i32 %val, i32 addrspace(3)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst ret void } -define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( -; GFX6-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( +; GFX6-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4452,7 +6654,7 @@ ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX7-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4463,7 +6665,7 @@ ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4473,7 +6675,7 @@ ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4483,7 +6685,7 @@ ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4495,7 +6697,7 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4505,7 +6707,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4517,12 +6719,12 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst ret void } -define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( -; GFX6-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4534,7 +6736,7 @@ ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX7-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4545,7 +6747,7 @@ ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4555,7 +6757,7 @@ ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4565,7 +6767,7 @@ ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4577,7 +6779,7 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4587,7 +6789,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4599,12 +6801,12 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst ret void } -define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( -; GFX6-LABEL: local_system_one_as_release_monotonic_cmpxchg: +define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4616,7 +6818,7 @@ ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX7-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4627,7 +6829,7 @@ ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4637,7 +6839,7 @@ ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4647,7 +6849,7 @@ ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4659,7 +6861,7 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4669,7 +6871,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4681,12 +6883,12 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst ret void } -define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( -; GFX6-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4695,10 +6897,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX7-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4706,30 +6910,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4738,37 +6948,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( -; GFX6-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4777,10 +6995,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX7-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4788,30 +7008,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4820,37 +7046,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( -; GFX6-LABEL: local_system_one_as_acquire_acquire_cmpxchg: +define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4859,10 +7093,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_acquire_acquire_cmpxchg: +; GFX7-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4870,30 +7106,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg: +; GFX10-CU-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4902,37 +7144,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( -; GFX6-LABEL: local_system_one_as_release_acquire_cmpxchg: +define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4941,10 +7191,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_release_acquire_cmpxchg: +; GFX7-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4952,30 +7204,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg: +; GFX10-WGP-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_release_acquire_cmpxchg: +; GFX10-CU-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_release_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4984,37 +7242,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( -; GFX6-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: +define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5023,10 +7289,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: +; GFX7-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5034,30 +7302,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5066,37 +7340,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( -; GFX6-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: +define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5105,10 +7387,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: +; GFX7-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5116,30 +7400,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5148,37 +7438,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( -; GFX6-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5187,10 +7485,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5198,30 +7498,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5230,37 +7536,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5274,7 +7588,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5287,7 +7601,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5299,7 +7613,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5311,7 +7625,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5325,7 +7639,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5337,7 +7651,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5351,14 +7665,14 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5372,7 +7686,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5385,7 +7699,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5397,7 +7711,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5409,7 +7723,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5423,7 +7737,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5435,7 +7749,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5449,14 +7763,14 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5470,7 +7784,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5483,7 +7797,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5495,7 +7809,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5507,7 +7821,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5521,7 +7835,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5533,7 +7847,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5547,14 +7861,14 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( -; GFX6-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5568,7 +7882,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX7-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5581,7 +7895,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5593,7 +7907,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5605,7 +7919,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5619,7 +7933,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5631,7 +7945,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5645,14 +7959,14 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( -; GFX6-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5666,7 +7980,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: +; GFX7-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5679,7 +7993,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5691,7 +8005,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5703,7 +8017,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5717,7 +8031,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5729,7 +8043,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5743,14 +8057,14 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( -; GFX6-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5764,7 +8078,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5777,7 +8091,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5789,7 +8103,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5801,7 +8115,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5815,7 +8129,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5827,7 +8141,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5841,14 +8155,14 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( -; GFX6-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5862,7 +8176,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5875,7 +8189,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5887,7 +8201,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5899,7 +8213,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5913,7 +8227,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5925,7 +8239,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5939,7 +8253,7 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll @@ -88,8 +88,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") unordered, align 4 @@ -178,8 +176,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") monotonic, align 4 @@ -268,8 +264,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") acquire, align 4 @@ -358,8 +352,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") seq_cst, align 4 @@ -434,8 +426,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") unordered, align 4 @@ -509,8 +499,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") monotonic, align 4 @@ -584,8 +572,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") release, align 4 @@ -659,8 +645,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") seq_cst, align 4 @@ -734,8 +718,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") monotonic @@ -809,8 +791,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acquire @@ -884,8 +864,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") release @@ -959,8 +937,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acq_rel @@ -1034,8 +1010,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") seq_cst @@ -1123,8 +1097,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acquire @@ -1213,8 +1185,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acq_rel @@ -1303,8 +1273,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") seq_cst @@ -1387,8 +1355,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1471,8 +1437,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1555,8 +1519,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1639,8 +1601,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1723,12 +1683,92 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( +; GFX6-LABEL: local_wavefront_monotonic_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm ; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; +; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire ret void } @@ -1807,8 +1847,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1891,8 +1929,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1975,8 +2011,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2059,8 +2093,6 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2068,8 +2100,8 @@ ret void } -define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( -; GFX6-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2081,7 +2113,7 @@ ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX7-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2092,7 +2124,7 @@ ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2102,7 +2134,7 @@ ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2112,7 +2144,7 @@ ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2124,7 +2156,7 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2134,7 +2166,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2143,17 +2175,15 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst ret void } -define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2162,12 +2192,10 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2175,36 +2203,30 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2213,47 +2235,37 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(3)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst ret void } -define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( +; GFX6-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2262,12 +2274,10 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2275,36 +2285,30 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2313,47 +2317,37 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(3)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst ret void } -define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2362,12 +2356,10 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2375,36 +2367,30 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2413,47 +2399,37 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(3)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst ret void } -define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( -; GFX6-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2462,12 +2438,10 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: +; GFX7-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2475,36 +2449,30 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2513,47 +2481,37 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(3)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst ret void } -define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( -; GFX6-LABEL: local_wavefront_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2567,7 +2525,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_release_acquire_ret_cmpxchg: +; GFX7-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2580,7 +2538,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2592,7 +2550,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2604,7 +2562,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2618,7 +2576,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2630,7 +2588,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2641,19 +2599,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( -; GFX6-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2667,7 +2623,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX7-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2680,7 +2636,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2692,7 +2648,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2704,7 +2660,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2718,7 +2674,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2730,7 +2686,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2741,19 +2697,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( -; GFX6-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2767,7 +2721,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX7-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2780,7 +2734,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2792,7 +2746,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2804,7 +2758,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2818,7 +2772,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2830,7 +2784,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2841,19 +2795,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( -; GFX6-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2867,7 +2819,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX7-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2880,7 +2832,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2892,7 +2844,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2904,7 +2856,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2918,7 +2870,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2930,7 +2882,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2941,1324 +2893,3352 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_unordered_load( -; GFX6-LABEL: local_wavefront_one_as_unordered_load: +define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_unordered_load: +; GFX7-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_unordered_load: +; GFX10-WGP-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_unordered_load: +; GFX10-CU-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_unordered_load: +; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_load: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_unordered_load: +; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") unordered, align 4 - store i32 %val, i32 addrspace(3)* %out + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( -; GFX6-LABEL: local_wavefront_one_as_monotonic_load: +define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_monotonic_load: +; GFX7-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_load: +; GFX10-WGP-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_load: +; GFX10-CU-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_load: +; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_load: +; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") monotonic, align 4 - store i32 %val, i32 addrspace(3)* %out + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_acquire_load( -; GFX6-LABEL: local_wavefront_one_as_acquire_load: +define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_acquire_load: +; GFX7-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_load: +; GFX10-WGP-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_acquire_load: +; GFX10-CU-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_load: +; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_load: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_load: +; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") acquire, align 4 - store i32 %val, i32 addrspace(3)* %out + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( -; GFX6-LABEL: local_wavefront_one_as_seq_cst_load: +define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX7-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX10-WGP-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX10-CU-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_load: +; SKIP-CACHE-INV-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX90A-TGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") seq_cst, align 4 - store i32 %val, i32 addrspace(3)* %out + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_unordered_store( -; GFX6-LABEL: local_wavefront_one_as_unordered_store: +define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_unordered_store: +; GFX7-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_unordered_store: +; GFX10-WGP-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_unordered_store: +; GFX10-CU-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_unordered_store: +; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_store: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_unordered_store: +; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") unordered, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( -; GFX6-LABEL: local_wavefront_one_as_monotonic_store: +define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_monotonic_store: +; GFX7-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_store: +; GFX10-WGP-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_store: +; GFX10-CU-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_store: +; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_store: +; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") monotonic, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_release_store( -; GFX6-LABEL: local_wavefront_one_as_release_store: +define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_release_store: +; GFX7-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_release_store: +; GFX10-WGP-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_release_store: +; GFX10-CU-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_store: +; SKIP-CACHE-INV-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_store: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_store: +; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") release, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( -; GFX6-LABEL: local_wavefront_one_as_seq_cst_store: +define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX7-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX10-WGP-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 -; GFX10-WGP-NEXT: s_endpgm +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX10-CU-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_store: +; SKIP-CACHE-INV-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") seq_cst, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( -; GFX6-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX7-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX10-WGP-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX10-CU-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm ; +; GFX10-WGP-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm ; - i32 addrspace(3)* %out, i32 %in) { +; GFX10-CU-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") monotonic + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( -; GFX6-LABEL: local_wavefront_one_as_acquire_atomicrmw: +define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_unordered_load( +; GFX6-LABEL: local_wavefront_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( +; GFX6-LABEL: local_wavefront_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acquire_load( +; GFX6-LABEL: local_wavefront_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( +; GFX6-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_unordered_store( +; GFX6-LABEL: local_wavefront_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( +; GFX6-LABEL: local_wavefront_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_release_store( +; GFX6-LABEL: local_wavefront_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( +; GFX6-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( +; GFX6-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( +; GFX6-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acquire + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( +; GFX6-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") release + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX7-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acquire + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire ret void } -define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( -; GFX6-LABEL: local_wavefront_one_as_release_atomicrmw: +define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX7-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX10-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") release + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire ret void } -define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( -; GFX6-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX7-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX10-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX10-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acq_rel + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire ret void } -define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( -; GFX6-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX7-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") seq_cst + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire ret void } -define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( -; GFX6-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX7-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acquire - store i32 %val, i32 addrspace(3)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire ret void } -define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( -; GFX6-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX7-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acq_rel - store i32 %val, i32 addrspace(3)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst ret void } -define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( -; GFX6-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX7-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") seq_cst - store i32 %val, i32 addrspace(3)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst ret void } -define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( -; GFX6-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4270,7 +6250,7 @@ ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX7-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4281,7 +6261,7 @@ ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4291,7 +6271,7 @@ ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4301,7 +6281,7 @@ ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4313,7 +6293,7 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4323,7 +6303,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4332,17 +6312,15 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst ret void } -define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( -; GFX6-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4354,7 +6332,7 @@ ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX7-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4365,7 +6343,7 @@ ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4375,7 +6353,7 @@ ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4385,7 +6363,7 @@ ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4397,7 +6375,7 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4407,7 +6385,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4416,17 +6394,15 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst ret void } -define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( -; GFX6-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4438,7 +6414,7 @@ ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX7-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4449,7 +6425,7 @@ ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4459,7 +6435,7 @@ ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4469,7 +6445,7 @@ ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4481,7 +6457,7 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4491,7 +6467,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4500,17 +6476,15 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst ret void } -define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( -; GFX6-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4519,10 +6493,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX7-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4530,30 +6506,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4562,39 +6544,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( -; GFX6-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4603,10 +6591,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX7-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4614,30 +6604,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4646,39 +6642,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( -; GFX6-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: +define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4687,10 +6689,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX7-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4698,30 +6702,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4730,39 +6740,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( -; GFX6-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: +define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4771,10 +6787,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: +; GFX7-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4782,30 +6800,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4814,39 +6838,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( -; GFX6-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4855,10 +6885,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX7-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4866,30 +6898,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4898,39 +6936,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( -; GFX6-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: +define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4939,10 +6983,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX7-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4950,30 +6996,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4982,39 +7034,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( -; GFX6-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5023,10 +7081,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5034,30 +7094,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5066,39 +7132,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5112,7 +7184,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5125,7 +7197,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5137,7 +7209,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5149,7 +7221,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5163,7 +7235,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5175,7 +7247,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5186,19 +7258,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5212,7 +7282,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5225,7 +7295,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5237,7 +7307,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5249,7 +7319,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5263,7 +7333,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5275,7 +7345,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5286,19 +7356,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5312,7 +7380,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5325,7 +7393,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5337,7 +7405,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5349,7 +7417,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5363,7 +7431,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5375,7 +7443,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5386,19 +7454,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( -; GFX6-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5412,7 +7478,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX7-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5425,7 +7491,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5437,7 +7503,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5449,7 +7515,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5463,7 +7529,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5475,7 +7541,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5486,19 +7552,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( -; GFX6-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5512,7 +7576,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX7-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5525,7 +7589,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5537,7 +7601,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5549,7 +7613,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5563,7 +7627,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5575,7 +7639,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5586,19 +7650,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( -; GFX6-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5612,7 +7674,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5625,7 +7687,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5637,7 +7699,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5649,7 +7711,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5663,7 +7725,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5675,7 +7737,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5686,19 +7748,17 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( -; GFX6-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5712,7 +7772,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5725,7 +7785,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5737,7 +7797,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5749,7 +7809,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5763,7 +7823,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5775,7 +7835,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5786,12 +7846,10 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void @@ -5886,8 +7944,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -1836,6 +1836,96 @@ ret void } +define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( +; GFX6-LABEL: local_workgroup_monotonic_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_monotonic_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_monotonic_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_monotonic_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_acquire_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire + ret void +} + define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; GFX6-LABEL: local_workgroup_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry @@ -2220,8 +2310,8 @@ ret void } -define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( -; GFX6-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2235,7 +2325,7 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX7-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2248,7 +2338,7 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2262,7 +2352,7 @@ ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2274,7 +2364,7 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2288,7 +2378,7 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2300,7 +2390,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2314,12 +2404,12 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic seq_cst ret void } -define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2328,12 +2418,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2341,37 +2431,38 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2380,46 +2471,43 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(3)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire seq_cst ret void } -define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( +; GFX6-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2429,12 +2517,11 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2443,12 +2530,11 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2457,13 +2543,12 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2471,12 +2556,11 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2486,12 +2570,11 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2499,12 +2582,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2512,22 +2594,18 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(3)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") release seq_cst ret void } -define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2537,12 +2615,11 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2551,12 +2628,11 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2565,13 +2641,12 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2579,12 +2654,11 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2594,12 +2668,11 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2607,12 +2680,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2620,22 +2692,18 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(3)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel seq_cst ret void } -define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( -; GFX6-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2644,12 +2712,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: +; GFX7-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2657,37 +2725,38 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2696,46 +2765,43 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32 addrspace(3)* %out, align 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst ret void } -define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( -; GFX6-LABEL: local_workgroup_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2744,13 +2810,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_release_acquire_ret_cmpxchg: +; GFX7-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2758,41 +2823,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2801,49 +2861,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( -; GFX6-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2852,13 +2908,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX7-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2866,41 +2921,37 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -2909,33 +2960,30 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2944,14 +2992,14 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( -; GFX6-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -2966,7 +3014,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX7-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -2980,7 +3028,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2991,11 +3039,10 @@ ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3008,7 +3055,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -3023,7 +3070,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3036,7 +3083,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3045,21 +3092,20 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( -; GFX6-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -3074,7 +3120,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX7-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -3088,7 +3134,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3103,7 +3149,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3116,7 +3162,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -3131,7 +3177,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3144,7 +3190,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3160,1287 +3206,3443 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_unordered_load( -; GFX6-LABEL: local_workgroup_one_as_unordered_load: +define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_unordered_load: +; GFX7-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_unordered_load: +; GFX10-WGP-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_unordered_load: +; GFX10-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_unordered_load: +; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_load: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_unordered_load: +; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") unordered, align 4 - store i32 %val, i32 addrspace(3)* %out + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( -; GFX6-LABEL: local_workgroup_one_as_monotonic_load: +define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_monotonic_load: +; GFX7-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_load: +; GFX10-WGP-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_load: +; GFX10-CU-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_load: +; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_load: +; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") monotonic, align 4 - store i32 %val, i32 addrspace(3)* %out + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_acquire_load( -; GFX6-LABEL: local_workgroup_one_as_acquire_load: +define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_acquire_load: +; GFX7-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_load: +; GFX10-WGP-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_acquire_load: +; GFX10-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_load: +; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_load: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_load: +; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") acquire, align 4 - store i32 %val, i32 addrspace(3)* %out + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( -; GFX6-LABEL: local_workgroup_one_as_seq_cst_load: +define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_seq_cst_load: +; GFX7-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: ds_read_b32 v0, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_load: +; GFX10-WGP-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_load: +; GFX10-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_load: +; SKIP-CACHE-INV-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load: +; GFX90A-TGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") seq_cst, align 4 - store i32 %val, i32 addrspace(3)* %out + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_unordered_store( -; GFX6-LABEL: local_workgroup_one_as_unordered_store: +define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_unordered_store: +; GFX7-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_unordered_store: +; GFX10-WGP-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_unordered_store: +; GFX10-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_unordered_store: +; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_store: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_unordered_store: +; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") unordered, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( -; GFX6-LABEL: local_workgroup_one_as_monotonic_store: +define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_monotonic_store: +; GFX7-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_store: +; GFX10-WGP-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_store: +; GFX10-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_store: +; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_store: +; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") monotonic, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_release_store( -; GFX6-LABEL: local_workgroup_one_as_release_store: +define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_release_store: +; GFX7-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_release_store: +; GFX10-WGP-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_release_store: +; GFX10-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_store: +; SKIP-CACHE-INV-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_store: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_store: +; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") release, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( -; GFX6-LABEL: local_workgroup_one_as_seq_cst_store: +define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_seq_cst_store: +; GFX7-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_store: +; GFX10-WGP-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 -; GFX10-WGP-NEXT: s_endpgm -; -; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_store: +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_store: +; SKIP-CACHE-INV-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store: +; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 %in, i32 addrspace(3)* %out) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") seq_cst, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( -; GFX6-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; GFX7-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; GFX10-WGP-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") release seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_unordered_load( +; GFX6-LABEL: local_workgroup_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_unordered_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( +; GFX6-LABEL: local_workgroup_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acquire_load( +; GFX6-LABEL: local_workgroup_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( +; GFX6-LABEL: local_workgroup_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_load: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_unordered_store( +; GFX6-LABEL: local_workgroup_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_unordered_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( +; GFX6-LABEL: local_workgroup_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_release_store( +; GFX6-LABEL: local_workgroup_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( +; GFX6-LABEL: local_workgroup_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_store: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( +; GFX6-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( +; GFX6-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( +; GFX6-LABEL: local_workgroup_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") release + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm +; +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV: ; %bb.0: ; %entry +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb +; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") monotonic + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic ret void } -define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( -; GFX6-LABEL: local_workgroup_one_as_acquire_atomicrmw: +define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; GFX7-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire ret void } -define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( -; GFX6-LABEL: local_workgroup_one_as_release_atomicrmw: +define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_release_atomicrmw: +; GFX7-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_release_atomicrmw: +; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_release_atomicrmw: +; GFX10-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") release + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire ret void } -define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( -; GFX6-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; GFX7-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; GFX10-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; GFX10-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acq_rel + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire ret void } -define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( -; GFX6-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; GFX7-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire ret void } -define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( -; GFX6-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; GFX7-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire - store i32 %val, i32 addrspace(3)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire ret void } -define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( -; GFX6-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX7-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acq_rel - store i32 %val, i32 addrspace(3)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst ret void } -define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( -; GFX6-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX7-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 +; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm - i32 addrspace(3)* %out, i32 %in) { + i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: - %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst - store i32 %val, i32 addrspace(3)* %out, align 4 + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst ret void } -define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( -; GFX6-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4452,7 +6654,7 @@ ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX7-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4463,7 +6665,7 @@ ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4473,7 +6675,7 @@ ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4483,7 +6685,7 @@ ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4495,7 +6697,7 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4505,7 +6707,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4517,12 +6719,12 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst ret void } -define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( -; GFX6-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4534,7 +6736,7 @@ ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX7-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4545,7 +6747,7 @@ ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4555,7 +6757,7 @@ ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4565,7 +6767,7 @@ ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4577,7 +6779,7 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4587,7 +6789,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4599,12 +6801,12 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst ret void } -define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( -; GFX6-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4616,7 +6818,7 @@ ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; GFX7-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4627,7 +6829,7 @@ ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4637,7 +6839,7 @@ ; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4647,7 +6849,7 @@ ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4659,7 +6861,7 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4669,7 +6871,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4681,12 +6883,12 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst ret void } -define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( -; GFX6-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4695,10 +6897,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX7-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4706,30 +6910,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4738,37 +6948,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( -; GFX6-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: +define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4777,10 +6995,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX7-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4788,30 +7008,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4820,37 +7046,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( -; GFX6-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: +define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4859,10 +7093,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX7-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4870,30 +7106,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4902,37 +7144,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( -; GFX6-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: +define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -4941,10 +7191,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: +; GFX7-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -4952,30 +7204,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -4984,37 +7242,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( -; GFX6-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5023,10 +7289,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX7-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5034,30 +7302,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5066,37 +7340,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( -; GFX6-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: +define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5105,10 +7387,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX7-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5116,30 +7400,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5148,37 +7438,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( -; GFX6-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5187,10 +7485,12 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5198,30 +7498,36 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5230,37 +7536,45 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 -; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5274,7 +7588,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5287,7 +7601,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5299,7 +7613,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5311,7 +7625,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5325,7 +7639,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5337,7 +7651,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5351,14 +7665,14 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5372,7 +7686,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5385,7 +7699,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5397,7 +7711,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5409,7 +7723,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5423,7 +7737,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5435,7 +7749,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5449,14 +7763,14 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( -; GFX6-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5470,7 +7784,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5483,7 +7797,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5495,7 +7809,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5507,7 +7821,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5521,7 +7835,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5533,7 +7847,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5547,14 +7861,14 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( -; GFX6-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5568,7 +7882,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX7-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5581,7 +7895,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5593,7 +7907,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5605,7 +7919,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5619,7 +7933,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5631,7 +7945,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5645,14 +7959,14 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( -; GFX6-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5666,7 +7980,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX7-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5679,7 +7993,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5691,7 +8005,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5703,7 +8017,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5717,7 +8031,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5729,7 +8043,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5743,14 +8057,14 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( -; GFX6-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5764,7 +8078,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5777,7 +8091,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5789,7 +8103,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5801,7 +8115,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5815,7 +8129,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5827,7 +8141,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5841,14 +8155,14 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void } -define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( -; GFX6-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x1 @@ -5862,7 +8176,7 @@ ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; -; GFX7-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -5875,7 +8189,7 @@ ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; -; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5887,7 +8201,7 @@ ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; -; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5899,7 +8213,7 @@ ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; -; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; SKIP-CACHE-INV-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb @@ -5913,7 +8227,7 @@ ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm ; -; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5925,7 +8239,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; -; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -5939,7 +8253,7 @@ i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 - %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(3)* %out, align 4 ret void