Index: docs/AMDGPUUsage.rst =================================================================== --- docs/AMDGPUUsage.rst +++ docs/AMDGPUUsage.rst @@ -315,62 +315,80 @@ .. table:: AMDHSA LLVM Sync Scopes :name: amdgpu-amdhsa-llvm-sync-scopes-table - ================ ========================================================== - LLVM Sync Scope Description - ================ ========================================================== - *none* The default: ``system``. - - Synchronizes with, and participates in modification and - seq_cst total orderings with, other operations (except - image operations) for all address spaces (except private, - or generic that accesses private) provided the other - operation's sync scope is: - - - ``system``. - - ``agent`` and executed by a thread on the same agent. - - ``workgroup`` and executed by a thread in the same - workgroup. - - ``wavefront`` and executed by a thread in the same - wavefront. - - ``agent`` Synchronizes with, and participates in modification and - seq_cst total orderings with, other operations (except - image operations) for all address spaces (except private, - or generic that accesses private) provided the other - operation's sync scope is: - - - ``system`` or ``agent`` and executed by a thread on the - same agent. - - ``workgroup`` and executed by a thread in the same - workgroup. - - ``wavefront`` and executed by a thread in the same - wavefront. - - ``workgroup`` Synchronizes with, and participates in modification and - seq_cst total orderings with, other operations (except - image operations) for all address spaces (except private, - or generic that accesses private) provided the other - operation's sync scope is: - - - ``system``, ``agent`` or ``workgroup`` and executed by a - thread in the same workgroup. - - ``wavefront`` and executed by a thread in the same - wavefront. 
- - ``wavefront`` Synchronizes with, and participates in modification and - seq_cst total orderings with, other operations (except - image operations) for all address spaces (except private, - or generic that accesses private) provided the other - operation's sync scope is: - - - ``system``, ``agent``, ``workgroup`` or ``wavefront`` - and executed by a thread in the same wavefront. - - ``singlethread`` Only synchronizes with, and participates in modification - and seq_cst total orderings with, other operations (except - image operations) running in the same thread for all - address spaces (for example, in signal handlers). - ================ ========================================================== + ======================= =================================================== + LLVM Sync Scope Description + ======================= =================================================== + *none* The default: ``system``. + + Synchronizes with, and participates in modification + and seq_cst total orderings with, other operations + (except image operations) for all address spaces + (except private, or generic that accesses private) + provided the other operation's sync scope is: + + - ``system``. + - ``agent`` and executed by a thread on the same + agent. + - ``workgroup`` and executed by a thread in the + same workgroup. + - ``wavefront`` and executed by a thread in the + same wavefront. + + ``agent`` Synchronizes with, and participates in modification + and seq_cst total orderings with, other operations + (except image operations) for all address spaces + (except private, or generic that accesses private) + provided the other operation's sync scope is: + + - ``system`` or ``agent`` and executed by a thread + on the same agent. + - ``workgroup`` and executed by a thread in the + same workgroup. + - ``wavefront`` and executed by a thread in the + same wavefront. 
+ + ``workgroup`` Synchronizes with, and participates in modification + and seq_cst total orderings with, other operations + (except image operations) for all address spaces + (except private, or generic that accesses private) + provided the other operation's sync scope is: + + - ``system``, ``agent`` or ``workgroup`` and + executed by a thread in the same workgroup. + - ``wavefront`` and executed by a thread in the + same wavefront. + + ``wavefront`` Synchronizes with, and participates in modification + and seq_cst total orderings with, other operations + (except image operations) for all address spaces + (except private, or generic that accesses private) + provided the other operation's sync scope is: + + - ``system``, ``agent``, ``workgroup`` or + ``wavefront`` and executed by a thread in the + same wavefront. + + ``singlethread`` Only synchronizes with, and participates in + modification and seq_cst total orderings with, + other operations (except image operations) running + in the same thread for all address spaces (for + example, in signal handlers). + + ``one-as`` Same as ``system`` but only synchronizes with other + operations within the same address space. + + ``agent-one-as`` Same as ``agent`` but only synchronizes with other + operations within the same address space. + + ``workgroup-one-as`` Same as ``workgroup`` but only synchronizes with + other operations within the same address space. + + ``wavefront-one-as`` Same as ``wavefront`` but only synchronizes with + other operations within the same address space. + + ``singlethread-one-as`` Same as ``singlethread`` but only synchronizes with + other operations within the same address space. 
+ ======================= =================================================== AMDGPU Intrinsics ----------------- Index: lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h =================================================================== --- lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h +++ lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h @@ -29,12 +29,22 @@ // All supported memory/synchronization scopes can be found here: // http://llvm.org/docs/AMDGPUUsage.html#memory-scopes - /// Agent synchronization scope ID. + /// Agent synchronization scope ID (cross address space). SyncScope::ID AgentSSID; - /// Workgroup synchronization scope ID. + /// Workgroup synchronization scope ID (cross address space). SyncScope::ID WorkgroupSSID; - /// Wavefront synchronization scope ID. + /// Wavefront synchronization scope ID (cross address space). SyncScope::ID WavefrontSSID; + /// System synchronization scope ID (single address space). + SyncScope::ID SystemOneAddressSpaceSSID; + /// Agent synchronization scope ID (single address space). + SyncScope::ID AgentOneAddressSpaceSSID; + /// Workgroup synchronization scope ID (single address space). + SyncScope::ID WorkgroupOneAddressSpaceSSID; + /// Wavefront synchronization scope ID (single address space). + SyncScope::ID WavefrontOneAddressSpaceSSID; + /// Single thread synchronization scope ID (single address space). + SyncScope::ID SingleThreadOneAddressSpaceSSID; /// In AMDGPU target synchronization scopes are inclusive, meaning a /// larger synchronization scope is inclusive of a smaller synchronization @@ -43,35 +53,70 @@ /// \returns \p SSID's inclusion ordering, or "None" if \p SSID is not /// supported by the AMDGPU target. 
Optional<uint8_t> getSyncScopeInclusionOrdering(SyncScope::ID SSID) const { - if (SSID == SyncScope::SingleThread) + if (SSID == SyncScope::SingleThread || + SSID == getSingleThreadOneAddressSpaceSSID()) return 0; - else if (SSID == getWavefrontSSID()) + else if (SSID == getWavefrontSSID() || + SSID == getWavefrontOneAddressSpaceSSID()) return 1; - else if (SSID == getWorkgroupSSID()) + else if (SSID == getWorkgroupSSID() || + SSID == getWorkgroupOneAddressSpaceSSID()) return 2; - else if (SSID == getAgentSSID()) + else if (SSID == getAgentSSID() || + SSID == getAgentOneAddressSpaceSSID()) return 3; - else if (SSID == SyncScope::System) + else if (SSID == SyncScope::System || + SSID == getSystemOneAddressSpaceSSID()) return 4; return None; } + /// \returns True if \p SSID is restricted to single address space, false + /// otherwise + bool isOneAddressSpace(SyncScope::ID SSID) const { + return SSID == getSingleThreadOneAddressSpaceSSID() || + SSID == getWavefrontOneAddressSpaceSSID() || + SSID == getWorkgroupOneAddressSpaceSSID() || + SSID == getAgentOneAddressSpaceSSID() || + SSID == getSystemOneAddressSpaceSSID(); + } + public: AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI); - /// \returns Agent synchronization scope ID. + /// \returns Agent synchronization scope ID (cross address space). SyncScope::ID getAgentSSID() const { return AgentSSID; } - /// \returns Workgroup synchronization scope ID. + /// \returns Workgroup synchronization scope ID (cross address space). SyncScope::ID getWorkgroupSSID() const { return WorkgroupSSID; } - /// \returns Wavefront synchronization scope ID. + /// \returns Wavefront synchronization scope ID (cross address space). SyncScope::ID getWavefrontSSID() const { return WavefrontSSID; } + /// \returns System synchronization scope ID (single address space). + SyncScope::ID getSystemOneAddressSpaceSSID() const { + return SystemOneAddressSpaceSSID; + } + /// \returns Agent synchronization scope ID (single address space).
+ SyncScope::ID getAgentOneAddressSpaceSSID() const { + return AgentOneAddressSpaceSSID; + } + /// \returns Workgroup synchronization scope ID (single address space). + SyncScope::ID getWorkgroupOneAddressSpaceSSID() const { + return WorkgroupOneAddressSpaceSSID; + } + /// \returns Wavefront synchronization scope ID (single address space). + SyncScope::ID getWavefrontOneAddressSpaceSSID() const { + return WavefrontOneAddressSpaceSSID; + } + /// \returns Single thread synchronization scope ID (single address space). + SyncScope::ID getSingleThreadOneAddressSpaceSSID() const { + return SingleThreadOneAddressSpaceSSID; + } /// In AMDGPU target synchronization scopes are inclusive, meaning a /// larger synchronization scope is inclusive of a smaller synchronization @@ -87,7 +132,11 @@ if (!AIO || !BIO) return None; - return AIO.getValue() > BIO.getValue(); + bool IsAOneAddressSpace = isOneAddressSpace(A); + bool IsBOneAddressSpace = isOneAddressSpace(B); + + return AIO.getValue() >= BIO.getValue() && + (IsAOneAddressSpace == IsBOneAddressSpace || !IsAOneAddressSpace); } }; Index: lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp +++ lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp @@ -23,6 +23,16 @@ AgentSSID = CTX.getOrInsertSyncScopeID("agent"); WorkgroupSSID = CTX.getOrInsertSyncScopeID("workgroup"); WavefrontSSID = CTX.getOrInsertSyncScopeID("wavefront"); + SystemOneAddressSpaceSSID = + CTX.getOrInsertSyncScopeID("one-as"); + AgentOneAddressSpaceSSID = + CTX.getOrInsertSyncScopeID("agent-one-as"); + WorkgroupOneAddressSpaceSSID = + CTX.getOrInsertSyncScopeID("workgroup-one-as"); + WavefrontOneAddressSpaceSSID = + CTX.getOrInsertSyncScopeID("wavefront-one-as"); + SingleThreadOneAddressSpaceSSID = + CTX.getOrInsertSyncScopeID("singlethread-one-as"); } } // end namespace llvm Index: lib/Target/AMDGPU/SIMemoryLegalizer.cpp 
=================================================================== --- lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -417,35 +417,46 @@ Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const { - /// TODO: For now assume OpenCL memory model which treats each - /// address space as having a separate happens-before relation, and - /// so an instruction only has ordering with respect to the address - /// space it accesses, and if it accesses multiple address spaces it - /// does not require ordering of operations in different address - /// spaces. - if (SSID == SyncScope::System) + if (SSID == SyncScope::System) + return std::make_tuple(SIAtomicScope::SYSTEM, + SIAtomicAddrSpace::ATOMIC, + true); + if (SSID == MMI->getAgentSSID()) + return std::make_tuple(SIAtomicScope::AGENT, + SIAtomicAddrSpace::ATOMIC, + true); + if (SSID == MMI->getWorkgroupSSID()) + return std::make_tuple(SIAtomicScope::WORKGROUP, + SIAtomicAddrSpace::ATOMIC, + true); + if (SSID == MMI->getWavefrontSSID()) + return std::make_tuple(SIAtomicScope::WAVEFRONT, + SIAtomicAddrSpace::ATOMIC, + true); + if (SSID == SyncScope::SingleThread) + return std::make_tuple(SIAtomicScope::SINGLETHREAD, + SIAtomicAddrSpace::ATOMIC, + true); + if (SSID == MMI->getSystemOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC & InstrScope, false); - if (SSID == MMI->getAgentSSID()) + if (SSID == MMI->getAgentOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC & InstrScope, false); - if (SSID == MMI->getWorkgroupSSID()) + if (SSID == MMI->getWorkgroupOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC & InstrScope, false); - if (SSID == MMI->getWavefrontSSID()) + if (SSID == MMI->getWavefrontOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC & InstrScope, false); -
if (SSID == SyncScope::SingleThread) + if (SSID == MMI->getSingleThreadOneAddressSpaceSSID()) return std::make_tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC & InstrScope, false); - /// TODO: To support HSA Memory Model need to add additional memory - /// scopes that specify that do require cross address space - /// ordering. return None; } @@ -721,13 +732,12 @@ bool VMCnt = false; bool LGKMCnt = false; - bool EXPCnt = false; if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: - VMCnt = true; + VMCnt |= true; break; case SIAtomicScope::WORKGROUP: case SIAtomicScope::WAVEFRONT: @@ -751,7 +761,7 @@ // also synchronizing with global/GDS memory as LDS operations // could be reordered with respect to later global/GDS memory // operations of the same wave. - LGKMCnt = IsCrossAddrSpaceOrdering; + LGKMCnt |= IsCrossAddrSpaceOrdering; break; case SIAtomicScope::WAVEFRONT: case SIAtomicScope::SINGLETHREAD: @@ -773,7 +783,7 @@ // also synchronizing with global/LDS memory as GDS operations // could be reordered with respect to later global/LDS memory // operations of the same wave. - EXPCnt = IsCrossAddrSpaceOrdering; + LGKMCnt |= IsCrossAddrSpaceOrdering; break; case SIAtomicScope::WORKGROUP: case SIAtomicScope::WAVEFRONT: @@ -786,11 +796,11 @@ } } - if (VMCnt || LGKMCnt || EXPCnt) { + if (VMCnt || LGKMCnt) { unsigned WaitCntImmediate = AMDGPU::encodeWaitcnt(IV, VMCnt ? 0 : getVmcntBitMask(IV), - EXPCnt ? 0 : getExpcntBitMask(IV), + getExpcntBitMask(IV), LGKMCnt ? 
0 : getLgkmcntBitMask(IV)); BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate); Changed = true; Index: test/CodeGen/AMDGPU/atomicrmw-nand.ll =================================================================== --- test/CodeGen/AMDGPU/atomicrmw-nand.ll +++ test/CodeGen/AMDGPU/atomicrmw-nand.ll @@ -12,8 +12,10 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_not_b32_e32 v1, v2 ; GCN-NEXT: v_or_b32_e32 v1, -5, v1 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_mov_b32_e32 v2, v1 ; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7] Index: test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -27,9 +27,9 @@ %tmp1 = zext i32 %tmp to i64 %tmp2 = getelementptr inbounds [448 x i32], [448 x i32] addrspace(3)* @0, i32 0, i32 %tmp %tmp3 = load i32, i32 addrspace(3)* %tmp2, align 4 - fence syncscope("workgroup") release + fence syncscope("workgroup-one-as") release tail call void @llvm.amdgcn.s.barrier() - fence syncscope("workgroup") acquire + fence syncscope("workgroup-one-as") acquire %tmp4 = add nsw i32 %tmp3, %tmp3 %tmp5 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp4, i32 177, i32 15, i32 15, i1 zeroext false) %tmp6 = add nsw i32 %tmp5, %tmp4 Index: test/CodeGen/AMDGPU/local-atomics-fp.ll =================================================================== --- test/CodeGen/AMDGPU/local-atomics-fp.ll +++ test/CodeGen/AMDGPU/local-atomics-fp.ll @@ -34,7 +34,7 @@ ; HAS-ATOMICS-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000 ; HAS-ATOMICS: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32 ; HAS-ATOMICS: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64 -; HAS-ATOMICS: s_waitcnt 
lgkmcnt(1) +; HAS-ATOMICS: s_waitcnt vmcnt(0) lgkmcnt(0) ; HAS-ATOMICS: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]] define amdgpu_kernel void @lds_ds_fadd(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) { %idx.add = add nuw i32 %idx, 4 @@ -49,6 +49,27 @@ ret void } +; GCN-LABEL: {{^}}lds_ds_fadd_one_as: +; VI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 +; HAS-ATOMICS-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000 +; HAS-ATOMICS: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32 +; HAS-ATOMICS: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64 +; HAS-ATOMICS: s_waitcnt lgkmcnt(1) +; HAS-ATOMICS: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]] +define amdgpu_kernel void @lds_ds_fadd_one_as(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) { + %idx.add = add nuw i32 %idx, 4 + %shl0 = shl i32 %idx.add, 3 + %shl1 = shl i32 %idx.add, 4 + %ptr0 = inttoptr i32 %shl0 to float addrspace(3)* + %ptr1 = inttoptr i32 %shl1 to float addrspace(3)* + %a1 = atomicrmw fadd float addrspace(3)* %ptr0, float 4.2e+1 syncscope("one-as") seq_cst + %a2 = atomicrmw fadd float addrspace(3)* %ptr1, float 4.2e+1 syncscope("one-as") seq_cst + %a3 = atomicrmw fadd float addrspace(3)* %ptrf, float %a1 syncscope("one-as") seq_cst + store float %a3, float addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}lds_atomic_fadd_ret_f64: ; GCN: ds_read_b64 ; GCN: v_add_f64 Index: test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll +++ test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll @@ -6,7 +6,7 @@ ; FUNC-LABEL: {{^}}system_acquire: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: buffer_wbinvl1{{$}} ; GCN: s_endpgm define amdgpu_kernel void @system_acquire() { @@ -18,7 +18,7 @@ ; FUNC-LABEL: {{^}}system_release: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: 
s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: s_endpgm define amdgpu_kernel void @system_release() { entry: @@ -29,7 +29,7 @@ ; FUNC-LABEL: {{^}}system_acq_rel: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: buffer_wbinvl1{{$}} ; GCN: s_endpgm define amdgpu_kernel void @system_acq_rel() { @@ -41,7 +41,7 @@ ; FUNC-LABEL: {{^}}system_seq_cst: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: buffer_wbinvl1{{$}} ; GCN: s_endpgm define amdgpu_kernel void @system_seq_cst() { @@ -50,6 +50,53 @@ ret void } +; FUNC-LABEL: {{^}}system_one_as_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_one_as_acquire() { +entry: + fence syncscope("one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}system_one_as_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_one_as_release() { +entry: + fence syncscope("one-as") release + ret void +} + +; FUNC-LABEL: {{^}}system_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_one_as_acq_rel() { +entry: + fence syncscope("one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}system_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_one_as_seq_cst() { +entry: + fence syncscope("one-as") seq_cst + ret void +} + ; FUNC-LABEL: {{^}}singlethread_acquire: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE @@ -90,10 +137,50 @@ ret void } +; FUNC-LABEL: {{^}}singlethread_one_as_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_one_as_acquire() { +entry: + 
fence syncscope("singlethread-one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}singlethread_one_as_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_one_as_release() { +entry: + fence syncscope("singlethread-one-as") release + ret void +} + +; FUNC-LABEL: {{^}}singlethread_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_one_as_acq_rel() { +entry: + fence syncscope("singlethread-one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}singlethread_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_one_as_seq_cst() { +entry: + fence syncscope("singlethread-one-as") seq_cst + ret void +} + ; FUNC-LABEL: {{^}}agent_acquire: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: buffer_wbinvl1{{$}} ; GCN: s_endpgm define amdgpu_kernel void @agent_acquire() { @@ -105,7 +192,7 @@ ; FUNC-LABEL: {{^}}agent_release: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: s_endpgm define amdgpu_kernel void @agent_release() { entry: @@ -116,7 +203,7 @@ ; FUNC-LABEL: {{^}}agent_acq_rel: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: buffer_wbinvl1{{$}} ; GCN: s_endpgm define amdgpu_kernel void @agent_acq_rel() { @@ -128,7 +215,7 @@ ; FUNC-LABEL: {{^}}agent_seq_cst: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: buffer_wbinvl1{{$}} ; GCN: s_endpgm define amdgpu_kernel void @agent_seq_cst() { @@ -137,9 +224,56 @@ ret void } +; FUNC-LABEL: {{^}}agent_one_as_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void 
@agent_one_as_acquire() { +entry: + fence syncscope("agent-one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}agent_one_as_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_one_as_release() { +entry: + fence syncscope("agent-one-as") release + ret void +} + +; FUNC-LABEL: {{^}}agent_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_one_as_acq_rel() { +entry: + fence syncscope("agent-one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}agent_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_one_as_seq_cst() { +entry: + fence syncscope("agent-one-as") seq_cst + ret void +} + ; FUNC-LABEL: {{^}}workgroup_acquire: ; GCN: %bb.0 -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm define amdgpu_kernel void @workgroup_acquire() { @@ -150,7 +284,7 @@ ; FUNC-LABEL: {{^}}workgroup_release: ; GCN: %bb.0 -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm define amdgpu_kernel void @workgroup_release() { @@ -161,7 +295,7 @@ ; FUNC-LABEL: {{^}}workgroup_acq_rel: ; GCN: %bb.0 -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm define amdgpu_kernel void @workgroup_acq_rel() { @@ -172,7 +306,7 @@ ; FUNC-LABEL: {{^}}workgroup_seq_cst: ; GCN: %bb.0 -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm define amdgpu_kernel void @workgroup_seq_cst() { @@ -181,6 +315,50 @@ ret void } +; FUNC-LABEL: {{^}}workgroup_one_as_acquire: +; GCN: %bb.0 +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE 
+; GCN: s_endpgm +define amdgpu_kernel void @workgroup_one_as_acquire() { +entry: + fence syncscope("workgroup-one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}workgroup_one_as_release: +; GCN: %bb.0 +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_one_as_release() { +entry: + fence syncscope("workgroup-one-as") release + ret void +} + +; FUNC-LABEL: {{^}}workgroup_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_one_as_acq_rel() { +entry: + fence syncscope("workgroup-one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}workgroup_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_one_as_seq_cst() { +entry: + fence syncscope("workgroup-one-as") seq_cst + ret void +} + ; FUNC-LABEL: {{^}}wavefront_acquire: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE @@ -220,3 +398,43 @@ fence syncscope("wavefront") seq_cst ret void } + +; FUNC-LABEL: {{^}}wavefront_one_as_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_one_as_acquire() { +entry: + fence syncscope("wavefront-one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}wavefront_one_as_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_one_as_release() { +entry: + fence syncscope("wavefront-one-as") release + ret void +} + +; FUNC-LABEL: {{^}}wavefront_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_one_as_acq_rel() { +entry: + fence syncscope("wavefront-one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}wavefront_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_one_as_seq_cst() { +entry: + fence syncscope("wavefront-one-as") seq_cst + ret void +} 
Index: test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll +++ test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll @@ -2,9 +2,9 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; GCN-LABEL: {{^}}system_monotonic_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @system_monotonic_monotonic( i32* %out, i32 %in, i32 %old) { @@ -15,9 +15,9 @@ } ; GCN-LABEL: {{^}}system_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_acquire_monotonic( i32* %out, i32 %in, i32 %old) { @@ -28,9 +28,9 @@ } ; GCN-LABEL: {{^}}system_release_monotonic: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @system_release_monotonic( i32* %out, i32 %in, i32 %old) { @@ -41,9 +41,9 @@ } ; GCN-LABEL: {{^}}system_acq_rel_monotonic: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_acq_rel_monotonic( i32* %out, i32 %in, i32 %old) { @@ -54,9 +54,9 @@ } ; GCN-LABEL: {{^}}system_seq_cst_monotonic: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_seq_cst_monotonic( i32* %out, i32 %in, i32 %old) { @@ -67,9 +67,9 @@ } ; GCN-LABEL: {{^}}system_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_acquire_acquire( i32* %out, i32 %in, i32 %old) { @@ -80,9 +80,9 @@ } ; GCN-LABEL: {{^}}system_release_acquire: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_release_acquire( i32* %out, i32 %in, i32 %old) { @@ -93,9 +93,9 @@ } ; GCN-LABEL: {{^}}system_acq_rel_acquire: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_acq_rel_acquire( i32* %out, i32 %in, i32 %old) { @@ -106,9 +106,9 @@ } ; GCN-LABEL: {{^}}system_seq_cst_acquire: -; 
GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_seq_cst_acquire( i32* %out, i32 %in, i32 %old) { @@ -119,9 +119,9 @@ } ; GCN-LABEL: {{^}}system_seq_cst_seq_cst: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_seq_cst_seq_cst( i32* %out, i32 %in, i32 %old) { @@ -131,11 +131,141 @@ ret void } -; GCN-LABEL: {{^}}singlethread_monotonic_monotonic: +; GCN-LABEL: {{^}}system_one_as_monotonic_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_monotonic_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic + ret void +} + +; GCN-LABEL: {{^}}system_one_as_acquire_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_acquire_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + ret void +} + +; GCN-LABEL: {{^}}system_one_as_release_monotonic: +; 
GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_release_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic + ret void +} + +; GCN-LABEL: {{^}}system_one_as_acq_rel_monotonic: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_acq_rel_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + ret void +} + +; GCN-LABEL: {{^}}system_one_as_seq_cst_monotonic: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_seq_cst_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + ret void +} + +; GCN-LABEL: {{^}}system_one_as_acquire_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_acquire_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + ret 
void +} + +; GCN-LABEL: {{^}}system_one_as_release_acquire: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_release_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release acquire + ret void +} + +; GCN-LABEL: {{^}}system_one_as_acq_rel_acquire: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_acq_rel_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire + ret void +} + +; GCN-LABEL: {{^}}system_one_as_seq_cst_acquire: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_seq_cst_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire + ret void +} + +; GCN-LABEL: {{^}}system_one_as_seq_cst_seq_cst: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_seq_cst_seq_cst( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, 
i32 %in syncscope("one-as") seq_cst seq_cst + ret void +} + +; GCN-LABEL: {{^}}singlethread_monotonic_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_monotonic_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -145,9 +275,9 @@ } ; GCN-LABEL: {{^}}singlethread_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_acquire_monotonic( i32* %out, i32 %in, i32 %old) { @@ -158,9 +288,9 @@ } ; GCN-LABEL: {{^}}singlethread_release_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_release_monotonic( i32* %out, i32 %in, i32 %old) { @@ -171,9 +301,9 @@ } ; GCN-LABEL: {{^}}singlethread_acq_rel_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_acq_rel_monotonic( i32* %out, i32 %in, i32 %old) { @@ -184,9 +314,9 @@ } ; GCN-LABEL: {{^}}singlethread_seq_cst_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: 
flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_seq_cst_monotonic( i32* %out, i32 %in, i32 %old) { @@ -197,9 +327,9 @@ } ; GCN-LABEL: {{^}}singlethread_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_acquire_acquire( i32* %out, i32 %in, i32 %old) { @@ -210,9 +340,9 @@ } ; GCN-LABEL: {{^}}singlethread_release_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_release_acquire( i32* %out, i32 %in, i32 %old) { @@ -223,9 +353,9 @@ } ; GCN-LABEL: {{^}}singlethread_acq_rel_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_acq_rel_acquire( i32* %out, i32 %in, i32 %old) { @@ -236,9 +366,9 @@ } ; GCN-LABEL: {{^}}singlethread_seq_cst_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) 
lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_seq_cst_acquire( i32* %out, i32 %in, i32 %old) { @@ -249,9 +379,9 @@ } ; GCN-LABEL: {{^}}singlethread_seq_cst_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_seq_cst_seq_cst( i32* %out, i32 %in, i32 %old) { @@ -261,11 +391,141 @@ ret void } -; GCN-LABEL: {{^}}agent_monotonic_monotonic: +; GCN-LABEL: {{^}}singlethread_one_as_monotonic_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_monotonic_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_acquire_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_acquire_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_release_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: 
buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_release_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_acq_rel_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_acq_rel_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_seq_cst_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_seq_cst_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_acquire_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_acquire_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_release_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} 
+; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_release_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_acq_rel_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_acq_rel_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_seq_cst_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_seq_cst_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_seq_cst_seq_cst: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_seq_cst_seq_cst( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in 
syncscope("singlethread-one-as") seq_cst seq_cst + ret void +} + +; GCN-LABEL: {{^}}agent_monotonic_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_monotonic_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -275,9 +535,9 @@ } ; GCN-LABEL: {{^}}agent_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_acquire_monotonic( i32* %out, i32 %in, i32 %old) { @@ -288,9 +548,9 @@ } ; GCN-LABEL: {{^}}agent_release_monotonic: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_release_monotonic( i32* %out, i32 %in, i32 %old) { @@ -301,9 +561,9 @@ } ; GCN-LABEL: {{^}}agent_acq_rel_monotonic: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_acq_rel_monotonic( i32* %out, i32 %in, i32 %old) { @@ -314,9 +574,9 @@ } ; GCN-LABEL: {{^}}agent_seq_cst_monotonic: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( 
offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_seq_cst_monotonic( i32* %out, i32 %in, i32 %old) { @@ -327,9 +587,9 @@ } ; GCN-LABEL: {{^}}agent_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_acquire_acquire( i32* %out, i32 %in, i32 %old) { @@ -340,9 +600,9 @@ } ; GCN-LABEL: {{^}}agent_release_acquire: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_release_acquire( i32* %out, i32 %in, i32 %old) { @@ -353,9 +613,9 @@ } ; GCN-LABEL: {{^}}agent_acq_rel_acquire: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_acq_rel_acquire( i32* %out, i32 %in, i32 %old) { @@ -366,9 +626,9 @@ } ; GCN-LABEL: {{^}}agent_seq_cst_acquire: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_seq_cst_acquire( i32* %out, i32 %in, i32 %old) { 
@@ -379,9 +639,9 @@ } ; GCN-LABEL: {{^}}agent_seq_cst_seq_cst: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_seq_cst_seq_cst( i32* %out, i32 %in, i32 %old) { @@ -391,11 +651,141 @@ ret void } -; GCN-LABEL: {{^}}workgroup_monotonic_monotonic: +; GCN-LABEL: {{^}}agent_one_as_monotonic_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_monotonic_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_acquire_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_acquire_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_release_monotonic: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_release_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile 
i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_acq_rel_monotonic: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_acq_rel_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_seq_cst_monotonic: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_seq_cst_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_acquire_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_acquire_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_release_acquire: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_release_acquire( + i32* %out, i32 %in, i32 %old) { 
+entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_acq_rel_acquire: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_acq_rel_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_seq_cst_acquire: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_seq_cst_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_seq_cst_seq_cst: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_seq_cst_seq_cst( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst + ret void +} + +; GCN-LABEL: {{^}}workgroup_monotonic_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol define 
amdgpu_kernel void @workgroup_monotonic_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -405,9 +795,9 @@ } ; GCN-LABEL: {{^}}workgroup_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_acquire_monotonic( i32* %out, i32 %in, i32 %old) { @@ -418,9 +808,9 @@ } ; GCN-LABEL: {{^}}workgroup_release_monotonic: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_release_monotonic( i32* %out, i32 %in, i32 %old) { @@ -431,9 +821,9 @@ } ; GCN-LABEL: {{^}}workgroup_acq_rel_monotonic: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_acq_rel_monotonic( i32* %out, i32 %in, i32 %old) { @@ -444,9 +834,9 @@ } ; GCN-LABEL: {{^}}workgroup_seq_cst_monotonic: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_seq_cst_monotonic( i32* %out, i32 %in, i32 %old) { @@ -457,9 +847,9 @@ } ; GCN-LABEL: {{^}}workgroup_acquire_acquire: -; GCN-NOT: 
s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_acquire_acquire( i32* %out, i32 %in, i32 %old) { @@ -470,9 +860,9 @@ } ; GCN-LABEL: {{^}}workgroup_release_acquire: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_release_acquire( i32* %out, i32 %in, i32 %old) { @@ -483,9 +873,9 @@ } ; GCN-LABEL: {{^}}workgroup_acq_rel_acquire: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_acq_rel_acquire( i32* %out, i32 %in, i32 %old) { @@ -496,9 +886,9 @@ } ; GCN-LABEL: {{^}}workgroup_seq_cst_acquire: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_seq_cst_acquire( i32* %out, i32 %in, i32 %old) { @@ -509,9 +899,9 @@ } ; GCN-LABEL: {{^}}workgroup_seq_cst_seq_cst: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; 
GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_seq_cst_seq_cst( i32* %out, i32 %in, i32 %old) { @@ -521,11 +911,141 @@ ret void } -; GCN-LABEL: {{^}}wavefront_monotonic_monotonic: +; GCN-LABEL: {{^}}workgroup_one_as_monotonic_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_monotonic_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_acquire_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_acquire_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_release_monotonic: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_release_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_acq_rel_monotonic: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap 
v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_acq_rel_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_monotonic: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_seq_cst_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_acquire_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_acquire_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_release_acquire: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_release_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire + ret void +} + 
+; GCN-LABEL: {{^}}workgroup_one_as_acq_rel_acquire: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_acq_rel_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_acquire: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_seq_cst_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_seq_cst: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_seq_cst_seq_cst( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst + ret void +} + +; GCN-LABEL: {{^}}wavefront_monotonic_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_monotonic_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -535,9 +1055,9 @@ } ; GCN-LABEL: 
{{^}}wavefront_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_acquire_monotonic( i32* %out, i32 %in, i32 %old) { @@ -548,9 +1068,9 @@ } ; GCN-LABEL: {{^}}wavefront_release_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_release_monotonic( i32* %out, i32 %in, i32 %old) { @@ -561,9 +1081,9 @@ } ; GCN-LABEL: {{^}}wavefront_acq_rel_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_acq_rel_monotonic( i32* %out, i32 %in, i32 %old) { @@ -574,9 +1094,9 @@ } ; GCN-LABEL: {{^}}wavefront_seq_cst_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_seq_cst_monotonic( i32* %out, i32 %in, i32 %old) { @@ -587,9 +1107,9 @@ } ; GCN-LABEL: {{^}}wavefront_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], 
v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_acquire_acquire( i32* %out, i32 %in, i32 %old) { @@ -600,9 +1120,9 @@ } ; GCN-LABEL: {{^}}wavefront_release_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_release_acquire( i32* %out, i32 %in, i32 %old) { @@ -613,9 +1133,9 @@ } ; GCN-LABEL: {{^}}wavefront_acq_rel_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_acq_rel_acquire( i32* %out, i32 %in, i32 %old) { @@ -626,9 +1146,9 @@ } ; GCN-LABEL: {{^}}wavefront_seq_cst_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_seq_cst_acquire( i32* %out, i32 %in, i32 %old) { @@ -639,9 +1159,9 @@ } ; GCN-LABEL: {{^}}wavefront_seq_cst_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void 
@wavefront_seq_cst_seq_cst( i32* %out, i32 %in, i32 %old) { @@ -650,3 +1170,133 @@ %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst ret void } + +; GCN-LABEL: {{^}}wavefront_one_as_monotonic_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_monotonic_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_acquire_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_acquire_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_release_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_release_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_acq_rel_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt 
vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_acq_rel_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_seq_cst_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_seq_cst_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_acquire_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_acquire_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_release_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_release_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_acq_rel_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: 
flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_acq_rel_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_seq_cst_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_seq_cst_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_seq_cst_seq_cst: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_seq_cst_seq_cst( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst + ret void +} Index: test/CodeGen/AMDGPU/memory-legalizer-atomic-fence.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-atomic-fence.ll +++ test/CodeGen/AMDGPU/memory-legalizer-atomic-fence.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX8,GFX68 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX8,GFX68 %s -; FUNC-LABEL: 
{{^}}system_acquire: +; FUNC-LABEL: {{^}}system_one_as_acquire: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GFX6: s_waitcnt vmcnt(0){{$}} @@ -10,6 +10,232 @@ ; GFX8: s_waitcnt vmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol{{$}} ; GCN: s_endpgm +define amdgpu_kernel void @system_one_as_acquire() { +entry: + fence syncscope("one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}system_one_as_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_one_as_release() { +entry: + fence syncscope("one-as") release + ret void +} + +; FUNC-LABEL: {{^}}system_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX6: buffer_wbinvl1{{$}} +; GFX8: buffer_wbinvl1_vol{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_one_as_acq_rel() { +entry: + fence syncscope("one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}system_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX6: buffer_wbinvl1{{$}} +; GFX8: buffer_wbinvl1_vol{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_one_as_seq_cst() { +entry: + fence syncscope("one-as") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}singlethread_one_as_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_one_as_acquire() { +entry: + fence syncscope("singlethread-one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}singlethread_one_as_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_one_as_release() { +entry: + fence syncscope("singlethread-one-as") release + ret void +} + +; FUNC-LABEL: {{^}}singlethread_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_one_as_acq_rel() { +entry: + fence syncscope("singlethread-one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}singlethread_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: 
ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_one_as_seq_cst() { +entry: + fence syncscope("singlethread-one-as") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}agent_one_as_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GFX6: s_waitcnt vmcnt(0){{$}} +; GFX6-NEXT: buffer_wbinvl1{{$}} +; GFX8: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_one_as_acquire() { +entry: + fence syncscope("agent-one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}agent_one_as_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_one_as_release() { +entry: + fence syncscope("agent-one-as") release + ret void +} + +; FUNC-LABEL: {{^}}agent_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX6: buffer_wbinvl1{{$}} +; GFX8: buffer_wbinvl1_vol{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_one_as_acq_rel() { +entry: + fence syncscope("agent-one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}agent_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX6: buffer_wbinvl1{{$}} +; GFX8: buffer_wbinvl1_vol{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_one_as_seq_cst() { +entry: + fence syncscope("agent-one-as") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}workgroup_one_as_acquire: +; GCN: %bb.0 +; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_one_as_acquire() { +entry: + fence syncscope("workgroup-one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}workgroup_one_as_release: +; GCN: %bb.0 +; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_one_as_release() { +entry: + fence syncscope("workgroup-one-as") release + ret void +} + +; FUNC-LABEL: {{^}}workgroup_one_as_acq_rel: +; GCN: %bb.0 +; GFX68-NOT: 
s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_one_as_acq_rel() { +entry: + fence syncscope("workgroup-one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}workgroup_one_as_seq_cst: +; GCN: %bb.0 +; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_one_as_seq_cst() { +entry: + fence syncscope("workgroup-one-as") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}wavefront_one_as_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_one_as_acquire() { +entry: + fence syncscope("wavefront-one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}wavefront_one_as_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_one_as_release() { +entry: + fence syncscope("wavefront-one-as") release + ret void +} + +; FUNC-LABEL: {{^}}wavefront_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_one_as_acq_rel() { +entry: + fence syncscope("wavefront-one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}wavefront_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_one_as_seq_cst() { +entry: + fence syncscope("wavefront-one-as") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}system_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX6-NEXT: buffer_wbinvl1{{$}} +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol{{$}} +; GCN: s_endpgm define amdgpu_kernel void @system_acquire() { entry: fence acquire @@ -19,7 +245,7 @@ ; FUNC-LABEL: {{^}}system_release: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: s_endpgm define amdgpu_kernel void @system_release() { entry: @@ -30,7 +256,7 @@ ; FUNC-LABEL: {{^}}system_acq_rel: ; GCN: %bb.0 ; 
GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX6: buffer_wbinvl1{{$}} ; GFX8: buffer_wbinvl1_vol{{$}} ; GCN: s_endpgm @@ -43,7 +269,7 @@ ; FUNC-LABEL: {{^}}system_seq_cst: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX6: buffer_wbinvl1{{$}} ; GFX8: buffer_wbinvl1_vol{{$}} ; GCN: s_endpgm @@ -96,9 +322,9 @@ ; FUNC-LABEL: {{^}}agent_acquire: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GFX6: s_waitcnt vmcnt(0){{$}} +; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX6-NEXT: buffer_wbinvl1{{$}} -; GFX8: s_waitcnt vmcnt(0){{$}} +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol{{$}} ; GCN: s_endpgm define amdgpu_kernel void @agent_acquire() { @@ -110,7 +336,7 @@ ; FUNC-LABEL: {{^}}agent_release: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: s_endpgm define amdgpu_kernel void @agent_release() { entry: @@ -121,7 +347,7 @@ ; FUNC-LABEL: {{^}}agent_acq_rel: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX6: buffer_wbinvl1{{$}} ; GFX8: buffer_wbinvl1_vol{{$}} ; GCN: s_endpgm @@ -134,7 +360,7 @@ ; FUNC-LABEL: {{^}}agent_seq_cst: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX6: buffer_wbinvl1{{$}} ; GFX8: buffer_wbinvl1_vol{{$}} ; GCN: s_endpgm @@ -146,7 +372,7 @@ ; FUNC-LABEL: {{^}}workgroup_acquire: ; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GFX68-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm define amdgpu_kernel void @workgroup_acquire() { @@ -157,7 +383,7 @@ ; FUNC-LABEL: {{^}}workgroup_release: ; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GFX68-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm define amdgpu_kernel void @workgroup_release() 
{ @@ -168,7 +394,7 @@ ; FUNC-LABEL: {{^}}workgroup_acq_rel: ; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GFX68-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm define amdgpu_kernel void @workgroup_acq_rel() { @@ -179,7 +405,7 @@ ; FUNC-LABEL: {{^}}workgroup_seq_cst: ; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GFX68-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm define amdgpu_kernel void @workgroup_seq_cst() { Index: test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir +++ test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir @@ -104,7 +104,7 @@ S_WAITCNT 127 $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec S_WAITCNT 3952 - BUFFER_ATOMIC_SMAX_ADDR64 killed $vgpr0, killed $vgpr1_vgpr2, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 400, 0, implicit $exec :: (volatile load seq_cst 4 from %ir.gep) + BUFFER_ATOMIC_SMAX_ADDR64 killed $vgpr0, killed $vgpr1_vgpr2, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 400, 0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from %ir.gep) bb.2.exit: liveins: $sgpr2_sgpr3 Index: test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll +++ test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll @@ -1,11 +1,311 @@ ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; GCN-LABEL: {{^}}system_monotonic: +; GCN-LABEL: {{^}}system_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol +define 
amdgpu_kernel void @system_one_as_monotonic( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") monotonic + ret void +} + +; GCN-LABEL: {{^}}system_one_as_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_acquire( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire + ret void +} + +; GCN-LABEL: {{^}}system_one_as_release: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_release( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") release + ret void +} + +; GCN-LABEL: {{^}}system_one_as_acq_rel: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_acq_rel( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel + ret void +} + +; GCN-LABEL: {{^}}system_one_as_seq_cst: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_one_as_seq_cst( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel 
void @singlethread_one_as_monotonic( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") monotonic + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_acquire( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_release: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_release( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") release + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_acq_rel: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_acq_rel( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel + ret void +} + +; GCN-LABEL: {{^}}singlethread_one_as_seq_cst: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_one_as_seq_cst( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; 
GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_monotonic( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") monotonic + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_acquire( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_release: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_release( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") release + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_acq_rel: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_acq_rel( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_seq_cst: +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_one_as_seq_cst( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} 
+; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_monotonic( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") monotonic + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_acquire( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_release: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_release( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") release + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_acq_rel: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_acq_rel( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_one_as_seq_cst( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: 
flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_monotonic( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") monotonic + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_acquire( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_release: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_release( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") release + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_acq_rel: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_acq_rel( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_seq_cst: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_one_as_seq_cst( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst + ret void +} + +; GCN-LABEL: {{^}}system_monotonic: +; 
GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @system_monotonic( i32* %out, i32 %in) { entry: @@ -14,9 +314,9 @@ } ; GCN-LABEL: {{^}}system_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_acquire( i32* %out, i32 %in) { @@ -26,9 +326,9 @@ } ; GCN-LABEL: {{^}}system_release: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @system_release( i32* %out, i32 %in) { @@ -38,9 +338,9 @@ } ; GCN-LABEL: {{^}}system_acq_rel: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_acq_rel( i32* %out, i32 %in) { @@ -50,9 +350,9 @@ } ; GCN-LABEL: {{^}}system_seq_cst: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @system_seq_cst( i32* %out, i32 %in) { @@ -62,9 +362,9 @@ } ; GCN-LABEL: {{^}}singlethread_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], 
v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_monotonic( i32* %out, i32 %in) { @@ -74,9 +374,9 @@ } ; GCN-LABEL: {{^}}singlethread_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_acquire( i32* %out, i32 %in) { @@ -86,9 +386,9 @@ } ; GCN-LABEL: {{^}}singlethread_release: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_release( i32* %out, i32 %in) { @@ -98,9 +398,9 @@ } ; GCN-LABEL: {{^}}singlethread_acq_rel: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_acq_rel( i32* %out, i32 %in) { @@ -110,9 +410,9 @@ } ; GCN-LABEL: {{^}}singlethread_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @singlethread_seq_cst( i32* %out, i32 %in) { @@ -122,9 +422,9 @@ } ; GCN-LABEL: {{^}}agent_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; 
GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_monotonic( i32* %out, i32 %in) { @@ -134,9 +434,9 @@ } ; GCN-LABEL: {{^}}agent_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_acquire( i32* %out, i32 %in) { @@ -146,9 +446,9 @@ } ; GCN-LABEL: {{^}}agent_release: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_release( i32* %out, i32 %in) { @@ -158,9 +458,9 @@ } ; GCN-LABEL: {{^}}agent_acq_rel: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_acq_rel( i32* %out, i32 %in) { @@ -170,9 +470,9 @@ } ; GCN-LABEL: {{^}}agent_seq_cst: -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol define amdgpu_kernel void @agent_seq_cst( i32* %out, i32 %in) { @@ -182,9 +482,9 @@ } ; GCN-LABEL: {{^}}workgroup_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) 
lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_monotonic( i32* %out, i32 %in) { @@ -194,9 +494,9 @@ } ; GCN-LABEL: {{^}}workgroup_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_acquire( i32* %out, i32 %in) { @@ -206,9 +506,9 @@ } ; GCN-LABEL: {{^}}workgroup_release: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_release( i32* %out, i32 %in) { @@ -218,9 +518,9 @@ } ; GCN-LABEL: {{^}}workgroup_acq_rel: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_acq_rel( i32* %out, i32 %in) { @@ -230,9 +530,9 @@ } ; GCN-LABEL: {{^}}workgroup_seq_cst: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @workgroup_seq_cst( i32* %out, i32 %in) { @@ -242,9 +542,9 @@ } ; GCN-LABEL: {{^}}wavefront_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: 
buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_monotonic( i32* %out, i32 %in) { @@ -254,9 +554,9 @@ } ; GCN-LABEL: {{^}}wavefront_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_acquire( i32* %out, i32 %in) { @@ -266,9 +566,9 @@ } ; GCN-LABEL: {{^}}wavefront_release: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_release( i32* %out, i32 %in) { @@ -278,9 +578,9 @@ } ; GCN-LABEL: {{^}}wavefront_acq_rel: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_acq_rel( i32* %out, i32 %in) { @@ -290,9 +590,9 @@ } ; GCN-LABEL: {{^}}wavefront_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: buffer_wbinvl1_vol define amdgpu_kernel void @wavefront_seq_cst( i32* %out, i32 %in) { Index: test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir +++ test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir @@ -11,7 +11,7 @@ $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def 
$vgpr0_vgpr1, implicit $sgpr2_sgpr3 $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $sgpr2_sgpr3, implicit $exec - renamable $vgpr2 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load seq_cst 4 from `i32 addrspace(42)* undef`) + renamable $vgpr2 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(42)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -30,7 +30,7 @@ $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store syncscope("agent") seq_cst 4 into `i32 addrspace(42)* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(42)* undef`) S_ENDPGM 0 ... 
@@ -47,7 +47,7 @@ $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec, implicit $exec $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $exec $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_ATOMIC_CMPSWAP killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("workgroup") seq_cst seq_cst 4 on `i32 addrspace(42)* undef`) + FLAT_ATOMIC_CMPSWAP killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("workgroup-one-as") seq_cst seq_cst 4 on `i32 addrspace(42)* undef`) S_ENDPGM 0 ... @@ -63,7 +63,7 @@ $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3 $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $sgpr2_sgpr3, implicit $exec $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - FLAT_ATOMIC_SWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("wavefront") seq_cst 4 on `i32 addrspace(42)* undef`) + FLAT_ATOMIC_SWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("wavefront-one-as") seq_cst 4 on `i32 addrspace(42)* undef`) S_ENDPGM 0 ... 
Index: test/CodeGen/AMDGPU/memory-legalizer-load.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-load.ll +++ test/CodeGen/AMDGPU/memory-legalizer-load.ll @@ -5,282 +5,282 @@ declare i32 @llvm.amdgcn.workitem.id.x() -; GCN-LABEL: {{^}}system_unordered: +; GCN-LABEL: {{^}}system_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @system_unordered( +define amdgpu_kernel void @system_one_as_unordered( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in unordered, align 4 + %val = load atomic i32, i32* %in syncscope("one-as") unordered, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}system_monotonic: +; GCN-LABEL: {{^}}system_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @system_monotonic( +define amdgpu_kernel void @system_one_as_monotonic( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in monotonic, align 4 + %val = load atomic i32, i32* %in syncscope("one-as") monotonic, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}system_acquire: +; GCN-LABEL: {{^}}system_one_as_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN-NEXT: s_waitcnt vmcnt(0){{$}} ; GFX89-NEXT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @system_acquire( +define amdgpu_kernel void @system_one_as_acquire( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in acquire, align 4 + %val = load atomic 
i32, i32* %in syncscope("one-as") acquire, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}system_seq_cst: +; GCN-LABEL: {{^}}system_one_as_seq_cst: ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN-NEXT: s_waitcnt vmcnt(0){{$}} ; GFX89-NEXT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @system_seq_cst( +define amdgpu_kernel void @system_one_as_seq_cst( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in seq_cst, align 4 + %val = load atomic i32, i32* %in syncscope("one-as") seq_cst, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}singlethread_unordered: +; GCN-LABEL: {{^}}singlethread_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @singlethread_unordered( +define amdgpu_kernel void @singlethread_one_as_unordered( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4 + %val = load atomic i32, i32* %in syncscope("singlethread-one-as") unordered, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}singlethread_monotonic: +; GCN-LABEL: {{^}}singlethread_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @singlethread_monotonic( +define amdgpu_kernel void @singlethread_one_as_monotonic( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4 + %val = load atomic i32, i32* %in syncscope("singlethread-one-as") monotonic, align 4 store i32 %val, 
i32* %out ret void } -; GCN-LABEL: {{^}}singlethread_acquire: +; GCN-LABEL: {{^}}singlethread_one_as_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @singlethread_acquire( +define amdgpu_kernel void @singlethread_one_as_acquire( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4 + %val = load atomic i32, i32* %in syncscope("singlethread-one-as") acquire, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}singlethread_seq_cst: +; GCN-LABEL: {{^}}singlethread_one_as_seq_cst: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @singlethread_seq_cst( +define amdgpu_kernel void @singlethread_one_as_seq_cst( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4 + %val = load atomic i32, i32* %in syncscope("singlethread-one-as") seq_cst, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}agent_unordered: +; GCN-LABEL: {{^}}agent_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @agent_unordered( +define amdgpu_kernel void @agent_one_as_unordered( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("agent") unordered, align 4 + %val = load atomic i32, i32* %in syncscope("agent-one-as") unordered, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}agent_monotonic: +; 
GCN-LABEL: {{^}}agent_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @agent_monotonic( +define amdgpu_kernel void @agent_one_as_monotonic( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("agent") monotonic, align 4 + %val = load atomic i32, i32* %in syncscope("agent-one-as") monotonic, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}agent_acquire: +; GCN-LABEL: {{^}}agent_one_as_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN-NEXT: s_waitcnt vmcnt(0){{$}} ; GFX89-NEXT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @agent_acquire( +define amdgpu_kernel void @agent_one_as_acquire( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("agent") acquire, align 4 + %val = load atomic i32, i32* %in syncscope("agent-one-as") acquire, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}agent_seq_cst: +; GCN-LABEL: {{^}}agent_one_as_seq_cst: ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN-NEXT: s_waitcnt vmcnt(0){{$}} ; GFX89-NEXT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @agent_seq_cst( +define amdgpu_kernel void @agent_one_as_seq_cst( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("agent") seq_cst, align 4 + %val = load atomic i32, i32* %in syncscope("agent-one-as") seq_cst, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}workgroup_unordered: +; GCN-LABEL: {{^}}workgroup_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], 
v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @workgroup_unordered( +define amdgpu_kernel void @workgroup_one_as_unordered( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("workgroup") unordered, align 4 + %val = load atomic i32, i32* %in syncscope("workgroup-one-as") unordered, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}workgroup_monotonic: +; GCN-LABEL: {{^}}workgroup_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @workgroup_monotonic( +define amdgpu_kernel void @workgroup_one_as_monotonic( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("workgroup") monotonic, align 4 + %val = load atomic i32, i32* %in syncscope("workgroup-one-as") monotonic, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}workgroup_acquire: +; GCN-LABEL: {{^}}workgroup_one_as_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GFX89-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @workgroup_acquire( +define amdgpu_kernel void @workgroup_one_as_acquire( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("workgroup") acquire, align 4 + %val = load atomic i32, i32* %in syncscope("workgroup-one-as") acquire, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}workgroup_seq_cst: +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst: ; GFX89-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GFX89-NOT: s_waitcnt 
vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @workgroup_seq_cst( +define amdgpu_kernel void @workgroup_one_as_seq_cst( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("workgroup") seq_cst, align 4 + %val = load atomic i32, i32* %in syncscope("workgroup-one-as") seq_cst, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}wavefront_unordered: +; GCN-LABEL: {{^}}wavefront_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @wavefront_unordered( +define amdgpu_kernel void @wavefront_one_as_unordered( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("wavefront") unordered, align 4 + %val = load atomic i32, i32* %in syncscope("wavefront-one-as") unordered, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}wavefront_monotonic: +; GCN-LABEL: {{^}}wavefront_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @wavefront_monotonic( +define amdgpu_kernel void @wavefront_one_as_monotonic( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("wavefront") monotonic, align 4 + %val = load atomic i32, i32* %in syncscope("wavefront-one-as") monotonic, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}wavefront_acquire: +; GCN-LABEL: {{^}}wavefront_one_as_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: 
flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @wavefront_acquire( +define amdgpu_kernel void @wavefront_one_as_acquire( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("wavefront") acquire, align 4 + %val = load atomic i32, i32* %in syncscope("wavefront-one-as") acquire, align 4 store i32 %val, i32* %out ret void } -; GCN-LABEL: {{^}}wavefront_seq_cst: +; GCN-LABEL: {{^}}wavefront_one_as_seq_cst: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GFX89-NOT: buffer_wbinvl1_vol ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define amdgpu_kernel void @wavefront_seq_cst( +define amdgpu_kernel void @wavefront_one_as_seq_cst( i32* %in, i32* %out) { entry: - %val = load atomic i32, i32* %in syncscope("wavefront") seq_cst, align 4 + %val = load atomic i32, i32* %in syncscope("wavefront-one-as") seq_cst, align 4 store i32 %val, i32* %out ret void } @@ -374,4 +374,284 @@ ret void } +; GCN-LABEL: {{^}}system_unordered: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @system_unordered( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in unordered, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}system_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @system_monotonic( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: 
{{^}}system_acquire: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NEXT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @system_acquire( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in acquire, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}system_seq_cst: +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NEXT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @system_seq_cst( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}singlethread_unordered: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @singlethread_unordered( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}singlethread_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @singlethread_monotonic( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}singlethread_acquire: +; GCN-NOT: s_waitcnt 
vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @singlethread_acquire( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}singlethread_seq_cst: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @singlethread_seq_cst( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}agent_unordered: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @agent_unordered( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("agent") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}agent_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @agent_monotonic( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("agent") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}agent_acquire: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: 
flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NEXT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @agent_acquire( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("agent") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}agent_seq_cst: +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NEXT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @agent_seq_cst( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("agent") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}workgroup_unordered: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @workgroup_unordered( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("workgroup") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}workgroup_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @workgroup_monotonic( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("workgroup") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}workgroup_acquire: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89: flat_load_dword 
[[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @workgroup_acquire( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("workgroup") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}workgroup_seq_cst: +; GFX89-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @workgroup_seq_cst( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("workgroup") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}wavefront_unordered: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @wavefront_unordered( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("wavefront") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}wavefront_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @wavefront_monotonic( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("wavefront") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}wavefront_acquire: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], 
v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @wavefront_acquire( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("wavefront") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +; GCN-LABEL: {{^}}wavefront_seq_cst: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @wavefront_seq_cst( + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("wavefront") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + !0 = !{i32 1} Index: test/CodeGen/AMDGPU/memory-legalizer-local.mir =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-local.mir +++ test/CodeGen/AMDGPU/memory-legalizer-local.mir @@ -17,7 +17,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") unordered 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit 
$exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -41,7 +41,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") monotonic 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -65,7 +65,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") acquire 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -89,7 +89,7 @@ 
$sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") seq_cst 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -113,7 +113,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") unordered 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -137,7 +137,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: 
(dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") monotonic 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -161,7 +161,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") acquire 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -185,7 +185,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 
4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") seq_cst 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -209,7 +209,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") unordered 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -233,7 +233,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit 
$exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") monotonic 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -257,7 +257,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") acquire 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -281,7 +281,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, 
implicit $exec :: (volatile load syncscope("workgroup") seq_cst 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -305,7 +305,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent") unordered 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -329,7 +329,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent") monotonic 4 from `i32 addrspace(3)* 
undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -353,7 +353,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent") acquire 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -377,7 +377,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent") seq_cst 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: 
(volatile load syncscope("agent-one-as") seq_cst 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -401,7 +401,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load unordered 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -425,7 +425,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load monotonic 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def 
$vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -449,7 +449,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load acquire 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -473,7 +473,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load seq_cst 4 from `i32 addrspace(3)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed 
renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -498,7 +498,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") unordered 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -520,7 +520,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") monotonic 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -542,7 +542,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") release 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... 
@@ -564,7 +564,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") seq_cst 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -586,7 +586,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") unordered 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") unordered 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -608,7 +608,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") monotonic 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") monotonic 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... 
@@ -630,7 +630,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") release 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") release 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -652,7 +652,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") seq_cst 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -674,7 +674,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") unordered 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") unordered 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... 
@@ -696,7 +696,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") monotonic 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") monotonic 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -718,7 +718,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") release 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -740,7 +740,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") seq_cst 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... 
@@ -762,7 +762,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent") unordered 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") unordered 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -784,7 +784,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent") monotonic 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") monotonic 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -806,7 +806,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent") release 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... 
@@ -828,7 +828,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent") seq_cst 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -850,7 +850,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store unordered 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") unordered 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -872,7 +872,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store monotonic 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") monotonic 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... 
@@ -894,7 +894,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store release 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -916,7 +916,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store seq_cst 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -938,7 +938,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") unordered 4 into `i32 addrspace(3)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... 
@@ -960,7 +960,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") monotonic 4 into `i32 addrspace(3)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -982,7 +982,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") acquire 4 into `i32 addrspace(3)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -1004,7 +1004,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") release 4 into `i32 addrspace(3)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... 
@@ -1026,7 +1026,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") acq_rel 4 into `i32 addrspace(3)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... @@ -1048,7 +1048,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") seq_cst 4 into `i32 addrspace(3)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(3)* undef`) S_ENDPGM 0 ... 
Index: test/CodeGen/AMDGPU/memory-legalizer-mesa3d.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-mesa3d.ll +++ test/CodeGen/AMDGPU/memory-legalizer-mesa3d.ll @@ -3,10 +3,228 @@ ; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s ; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s -; FUNC-LABEL: {{^}}system_acquire: +; FUNC-LABEL: {{^}}system_one_as_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_one_as_acquire() { +entry: + fence syncscope("one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}system_one_as_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_one_as_release() { +entry: + fence syncscope("one-as") release + ret void +} + +; FUNC-LABEL: {{^}}system_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_one_as_acq_rel() { +entry: + fence syncscope("one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}system_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_one_as_seq_cst() { +entry: + fence syncscope("one-as") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}singlethread_one_as_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_one_as_acquire() { +entry: + fence syncscope("singlethread-one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}singlethread_one_as_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_one_as_release() { +entry: + fence 
syncscope("singlethread-one-as") release + ret void +} + +; FUNC-LABEL: {{^}}singlethread_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_one_as_acq_rel() { +entry: + fence syncscope("singlethread-one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}singlethread_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_one_as_seq_cst() { +entry: + fence syncscope("singlethread-one-as") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}agent_one_as_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN-NEXT: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_one_as_acquire() { +entry: + fence syncscope("agent-one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}agent_one_as_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_one_as_release() { +entry: + fence syncscope("agent-one-as") release + ret void +} + +; FUNC-LABEL: {{^}}agent_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_one_as_acq_rel() { +entry: + fence syncscope("agent-one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}agent_one_as_seq_cst: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: buffer_wbinvl1{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_one_as_seq_cst() { +entry: + fence syncscope("agent-one-as") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}workgroup_one_as_acquire: +; GCN: %bb.0 +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_one_as_acquire() { +entry: + fence syncscope("workgroup-one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}workgroup_one_as_release: +; GCN: %bb.0 +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: 
ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_one_as_release() { +entry: + fence syncscope("workgroup-one-as") release + ret void +} + +; FUNC-LABEL: {{^}}workgroup_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_one_as_acq_rel() { +entry: + fence syncscope("workgroup-one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}workgroup_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_one_as_seq_cst() { +entry: + fence syncscope("workgroup-one-as") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}wavefront_one_as_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_one_as_acquire() { +entry: + fence syncscope("wavefront-one-as") acquire + ret void +} + +; FUNC-LABEL: {{^}}wavefront_one_as_release: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_one_as_release() { +entry: + fence syncscope("wavefront-one-as") release + ret void +} + +; FUNC-LABEL: {{^}}wavefront_one_as_acq_rel: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_one_as_acq_rel() { +entry: + fence syncscope("wavefront-one-as") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}wavefront_one_as_seq_cst: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_one_as_seq_cst() { +entry: + fence syncscope("wavefront-one-as") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}system_acquire: +; GCN: %bb.0 +; GCN-NOT: ATOMIC_FENCE +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: buffer_wbinvl1{{$}} ; GCN: s_endpgm define amdgpu_kernel void @system_acquire() { @@ -18,7 +236,7 @@ ; FUNC-LABEL: {{^}}system_release: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: s_endpgm 
define amdgpu_kernel void @system_release() { entry: @@ -29,7 +247,7 @@ ; FUNC-LABEL: {{^}}system_acq_rel: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: buffer_wbinvl1{{$}} ; GCN: s_endpgm define amdgpu_kernel void @system_acq_rel() { @@ -41,7 +259,7 @@ ; FUNC-LABEL: {{^}}system_seq_cst: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: buffer_wbinvl1{{$}} ; GCN: s_endpgm define amdgpu_kernel void @system_seq_cst() { @@ -93,7 +311,7 @@ ; FUNC-LABEL: {{^}}agent_acquire: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NEXT: buffer_wbinvl1{{$}} ; GCN: s_endpgm define amdgpu_kernel void @agent_acquire() { @@ -105,7 +323,7 @@ ; FUNC-LABEL: {{^}}agent_release: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: s_endpgm define amdgpu_kernel void @agent_release() { entry: @@ -116,7 +334,7 @@ ; FUNC-LABEL: {{^}}agent_acq_rel: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: buffer_wbinvl1{{$}} ; GCN: s_endpgm define amdgpu_kernel void @agent_acq_rel() { @@ -128,7 +346,7 @@ ; FUNC-LABEL: {{^}}agent_seq_cst: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN: buffer_wbinvl1{{$}} ; GCN: s_endpgm define amdgpu_kernel void @agent_seq_cst() { @@ -139,7 +357,7 @@ ; FUNC-LABEL: {{^}}workgroup_acquire: ; GCN: %bb.0 -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm define amdgpu_kernel void @workgroup_acquire() { @@ -150,7 +368,7 @@ ; FUNC-LABEL: {{^}}workgroup_release: ; GCN: %bb.0 -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm 
define amdgpu_kernel void @workgroup_release() { @@ -161,7 +379,7 @@ ; FUNC-LABEL: {{^}}workgroup_acq_rel: ; GCN: %bb.0 -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm define amdgpu_kernel void @workgroup_acq_rel() { @@ -172,7 +390,7 @@ ; FUNC-LABEL: {{^}}workgroup_seq_cst: ; GCN: %bb.0 -; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm define amdgpu_kernel void @workgroup_seq_cst() { Index: test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir +++ test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir @@ -55,7 +55,7 @@ S_WAITCNT 127 $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc $vgpr0 = V_ADD_I32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec - $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent") unordered 4 from `i32 addrspace(1)* undef`), (load syncscope("workgroup") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`) + $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent-one-as") unordered 4 from `i32 addrspace(1)* undef`), (load syncscope("workgroup-one-as") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`) $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec S_WAITCNT 3952 Index: test/CodeGen/AMDGPU/memory-legalizer-region.mir =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-region.mir +++ 
test/CodeGen/AMDGPU/memory-legalizer-region.mir @@ -17,7 +17,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 1, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") unordered 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 1, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -41,7 +41,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") monotonic 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -65,7 +65,7 @@ 
$sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") acquire 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -89,7 +89,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread") seq_cst 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -113,7 +113,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: 
(dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") unordered 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -137,7 +137,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") monotonic 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -161,7 +161,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, 
addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") acquire 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -185,7 +185,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront") seq_cst 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -209,7 +209,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, 
implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") unordered 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -233,7 +233,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") monotonic 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -257,7 +257,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, 
implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") acquire 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -281,7 +281,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup") seq_cst 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -305,7 +305,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent") unordered 4 from `i32 
addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -329,7 +329,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent") monotonic 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -353,7 +353,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent") acquire 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, 
implicit $exec :: (volatile load syncscope("agent-one-as") acquire 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -377,7 +377,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent") seq_cst 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -401,7 +401,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load unordered 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 
$sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -425,7 +425,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load monotonic 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -449,7 +449,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load acquire 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, 
implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -473,7 +473,7 @@ $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) $m0 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load seq_cst 4 from `i32 addrspace(2)* undef`) + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) @@ -498,7 +498,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") unordered 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... 
@@ -520,7 +520,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") monotonic 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -542,7 +542,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") release 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -564,7 +564,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") seq_cst 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... 
@@ -586,7 +586,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") unordered 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") unordered 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -608,7 +608,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") monotonic 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") monotonic 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -630,7 +630,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") release 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") release 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... 
@@ -652,7 +652,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront") seq_cst 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -674,7 +674,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") unordered 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") unordered 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -696,7 +696,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") monotonic 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") monotonic 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... 
@@ -718,7 +718,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") release 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -740,7 +740,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup") seq_cst 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -762,7 +762,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent") unordered 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") unordered 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... 
--- @@ -783,7 +783,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent") monotonic 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") monotonic 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -805,7 +805,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent") release 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -827,7 +827,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent") seq_cst 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... 
@@ -893,7 +893,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store release 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -915,7 +915,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store seq_cst 4 into `i32 addrspace(2)* undef`) + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -937,7 +937,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") unordered 4 into `i32 addrspace(2)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... 
@@ -959,7 +959,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") monotonic 4 into `i32 addrspace(2)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -981,7 +981,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") acquire 4 into `i32 addrspace(2)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -1003,7 +1003,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") release 4 into `i32 addrspace(2)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... 
@@ -1025,7 +1025,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") acq_rel 4 into `i32 addrspace(2)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... @@ -1047,7 +1047,7 @@ $m0 = S_MOV_B32 -1 $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread") seq_cst 4 into `i32 addrspace(2)* undef`) + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(2)* undef`) S_ENDPGM 0 ... 
Index: test/CodeGen/AMDGPU/memory-legalizer-store.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-store.ll +++ test/CodeGen/AMDGPU/memory-legalizer-store.ll @@ -5,203 +5,203 @@ declare i32 @llvm.amdgcn.workitem.id.x() -; GCN-LABEL: {{^}}system_unordered: +; GCN-LABEL: {{^}}system_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @system_unordered( +define amdgpu_kernel void @system_one_as_unordered( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out unordered, align 4 + store atomic i32 %in, i32* %out syncscope("one-as") unordered, align 4 ret void } -; GCN-LABEL: {{^}}system_monotonic: +; GCN-LABEL: {{^}}system_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @system_monotonic( +define amdgpu_kernel void @system_one_as_monotonic( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out monotonic, align 4 + store atomic i32 %in, i32* %out syncscope("one-as") monotonic, align 4 ret void } -; GCN-LABEL: {{^}}system_release: +; GCN-LABEL: {{^}}system_one_as_release: ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @system_release( +define amdgpu_kernel void @system_one_as_release( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out release, align 4 + store atomic i32 %in, i32* %out syncscope("one-as") release, align 4 ret void } -; GCN-LABEL: {{^}}system_seq_cst: +; GCN-LABEL: {{^}}system_one_as_seq_cst: ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @system_seq_cst( +define amdgpu_kernel void @system_one_as_seq_cst( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out seq_cst, align 4 + store atomic i32 %in, i32* 
%out syncscope("one-as") seq_cst, align 4 ret void } -; GCN-LABEL: {{^}}singlethread_unordered: +; GCN-LABEL: {{^}}singlethread_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @singlethread_unordered( +define amdgpu_kernel void @singlethread_one_as_unordered( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("singlethread") unordered, align 4 + store atomic i32 %in, i32* %out syncscope("singlethread-one-as") unordered, align 4 ret void } -; GCN-LABEL: {{^}}singlethread_monotonic: +; GCN-LABEL: {{^}}singlethread_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @singlethread_monotonic( +define amdgpu_kernel void @singlethread_one_as_monotonic( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4 + store atomic i32 %in, i32* %out syncscope("singlethread-one-as") monotonic, align 4 ret void } -; GCN-LABEL: {{^}}singlethread_release: +; GCN-LABEL: {{^}}singlethread_one_as_release: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @singlethread_release( +define amdgpu_kernel void @singlethread_one_as_release( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("singlethread") release, align 4 + store atomic i32 %in, i32* %out syncscope("singlethread-one-as") release, align 4 ret void } -; GCN-LABEL: {{^}}singlethread_seq_cst: +; GCN-LABEL: {{^}}singlethread_one_as_seq_cst: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @singlethread_seq_cst( +define amdgpu_kernel void @singlethread_one_as_seq_cst( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4 + store atomic i32 
%in, i32* %out syncscope("singlethread-one-as") seq_cst, align 4 ret void } -; GCN-LABEL: {{^}}agent_unordered: +; GCN-LABEL: {{^}}agent_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @agent_unordered( +define amdgpu_kernel void @agent_one_as_unordered( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("agent") unordered, align 4 + store atomic i32 %in, i32* %out syncscope("agent-one-as") unordered, align 4 ret void } -; GCN-LABEL: {{^}}agent_monotonic: +; GCN-LABEL: {{^}}agent_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @agent_monotonic( +define amdgpu_kernel void @agent_one_as_monotonic( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("agent") monotonic, align 4 + store atomic i32 %in, i32* %out syncscope("agent-one-as") monotonic, align 4 ret void } -; GCN-LABEL: {{^}}agent_release: +; GCN-LABEL: {{^}}agent_one_as_release: ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @agent_release( +define amdgpu_kernel void @agent_one_as_release( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("agent") release, align 4 + store atomic i32 %in, i32* %out syncscope("agent-one-as") release, align 4 ret void } -; GCN-LABEL: {{^}}agent_seq_cst: +; GCN-LABEL: {{^}}agent_one_as_seq_cst: ; GCN: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @agent_seq_cst( +define amdgpu_kernel void @agent_one_as_seq_cst( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("agent") seq_cst, align 4 + store atomic i32 %in, i32* %out syncscope("agent-one-as") seq_cst, align 4 ret void } -; GCN-LABEL: {{^}}workgroup_unordered: +; GCN-LABEL: 
{{^}}workgroup_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @workgroup_unordered( +define amdgpu_kernel void @workgroup_one_as_unordered( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("workgroup") unordered, align 4 + store atomic i32 %in, i32* %out syncscope("workgroup-one-as") unordered, align 4 ret void } -; GCN-LABEL: {{^}}workgroup_monotonic: +; GCN-LABEL: {{^}}workgroup_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @workgroup_monotonic( +define amdgpu_kernel void @workgroup_one_as_monotonic( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("workgroup") monotonic, align 4 + store atomic i32 %in, i32* %out syncscope("workgroup-one-as") monotonic, align 4 ret void } -; GCN-LABEL: {{^}}workgroup_release: +; GCN-LABEL: {{^}}workgroup_one_as_release: ; GFX89-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @workgroup_release( +define amdgpu_kernel void @workgroup_one_as_release( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("workgroup") release, align 4 + store atomic i32 %in, i32* %out syncscope("workgroup-one-as") release, align 4 ret void } -; GCN-LABEL: {{^}}workgroup_seq_cst: +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst: ; GFX89-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @workgroup_seq_cst( +define amdgpu_kernel void @workgroup_one_as_seq_cst( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("workgroup") seq_cst, align 4 + store atomic i32 %in, i32* %out syncscope("workgroup-one-as") seq_cst, align 4 ret void } -; GCN-LABEL: {{^}}wavefront_unordered: +; GCN-LABEL: {{^}}wavefront_one_as_unordered: ; GCN-NOT: 
s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @wavefront_unordered( +define amdgpu_kernel void @wavefront_one_as_unordered( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("wavefront") unordered, align 4 + store atomic i32 %in, i32* %out syncscope("wavefront-one-as") unordered, align 4 ret void } -; GCN-LABEL: {{^}}wavefront_monotonic: +; GCN-LABEL: {{^}}wavefront_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @wavefront_monotonic( +define amdgpu_kernel void @wavefront_one_as_monotonic( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("wavefront") monotonic, align 4 + store atomic i32 %in, i32* %out syncscope("wavefront-one-as") monotonic, align 4 ret void } -; GCN-LABEL: {{^}}wavefront_release: +; GCN-LABEL: {{^}}wavefront_one_as_release: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @wavefront_release( +define amdgpu_kernel void @wavefront_one_as_release( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("wavefront") release, align 4 + store atomic i32 %in, i32* %out syncscope("wavefront-one-as") release, align 4 ret void } -; GCN-LABEL: {{^}}wavefront_seq_cst: +; GCN-LABEL: {{^}}wavefront_one_as_seq_cst: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -define amdgpu_kernel void @wavefront_seq_cst( +define amdgpu_kernel void @wavefront_one_as_seq_cst( i32 %in, i32* %out) { entry: - store atomic i32 %in, i32* %out syncscope("wavefront") seq_cst, align 4 + store atomic i32 %in, i32* %out syncscope("wavefront-one-as") seq_cst, align 4 ret void } @@ -295,4 +295,204 @@ ret void } +; GCN-LABEL: {{^}}system_unordered: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword 
v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @system_unordered( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out unordered, align 4 + ret void +} + +; GCN-LABEL: {{^}}system_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @system_monotonic( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out monotonic, align 4 + ret void +} + +; GCN-LABEL: {{^}}system_release: +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @system_release( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out release, align 4 + ret void +} + +; GCN-LABEL: {{^}}system_seq_cst: +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @system_seq_cst( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out seq_cst, align 4 + ret void +} + +; GCN-LABEL: {{^}}singlethread_unordered: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @singlethread_unordered( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("singlethread") unordered, align 4 + ret void +} + +; GCN-LABEL: {{^}}singlethread_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @singlethread_monotonic( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4 + ret void +} + +; GCN-LABEL: {{^}}singlethread_release: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @singlethread_release( + i32 %in, i32* %out) { +entry: + store 
atomic i32 %in, i32* %out syncscope("singlethread") release, align 4 + ret void +} + +; GCN-LABEL: {{^}}singlethread_seq_cst: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @singlethread_seq_cst( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_unordered: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @agent_unordered( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("agent") unordered, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @agent_monotonic( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("agent") monotonic, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_release: +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @agent_release( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("agent") release, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_seq_cst: +; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @agent_seq_cst( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("agent") seq_cst, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_unordered: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @workgroup_unordered( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("workgroup") unordered, align 4 + ret void +} + 
+; GCN-LABEL: {{^}}workgroup_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @workgroup_monotonic( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("workgroup") monotonic, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_release: +; GFX89-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @workgroup_release( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("workgroup") release, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_seq_cst: +; GFX89-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @workgroup_seq_cst( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("workgroup") seq_cst, align 4 + ret void +} + +; GCN-LABEL: {{^}}wavefront_unordered: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @wavefront_unordered( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("wavefront") unordered, align 4 + ret void +} + +; GCN-LABEL: {{^}}wavefront_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @wavefront_monotonic( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("wavefront") monotonic, align 4 + ret void +} + +; GCN-LABEL: {{^}}wavefront_release: +; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @wavefront_release( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("wavefront") release, align 4 + ret void +} + +; GCN-LABEL: {{^}}wavefront_seq_cst: +; 
GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @wavefront_seq_cst( + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("wavefront") seq_cst, align 4 + ret void +} + !0 = !{i32 1}