Index: lib/Target/AMDGPU/SIMemoryLegalizer.cpp =================================================================== --- lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -352,6 +352,40 @@ }; +class SIGfx10CacheControl : public SIGfx7CacheControl { +protected: + bool CuMode = false; + + /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI + /// is modified, false otherwise. + bool enableDLCBit(const MachineBasicBlock::iterator &MI) const { + return enableNamedBit<AMDGPU::OpName::dlc>(MI); + } + +public: + + SIGfx10CacheControl(const GCNSubtarget &ST, bool CuMode) : + SIGfx7CacheControl(ST), CuMode(CuMode) {} + + bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override; + + bool insertCacheInvalidate(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const override; + + bool insertWait(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + SIMemOp Op, + bool IsCrossAddrSpaceOrdering, + Position Pos) const override; +}; + class SIMemoryLegalizer final : public MachineFunctionPass { private: @@ -623,7 +657,9 @@ GCNSubtarget::Generation Generation = ST.getGeneration(); if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) return make_unique<SIGfx6CacheControl>(ST); - return make_unique<SIGfx7CacheControl>(ST); + if (Generation < AMDGPUSubtarget::GFX10) + return make_unique<SIGfx7CacheControl>(ST); + return make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled()); } bool SIGfx6CacheControl::enableLoadCacheBypass( @@ -860,6 +896,231 @@ return Changed; } +bool SIGfx10CacheControl::enableLoadCacheBypass( + const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + assert(MI->mayLoad() && !MI->mayStore()); + bool Changed = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + /// TODO Do not set glc for 
rmw atomic operations as they + /// implicitly bypass the L0/L1 caches. + + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + Changed |= enableGLCBit(MI); + Changed |= enableDLCBit(MI); + break; + case SIAtomicScope::WORKGROUP: + // In WGP mode the waves of a work-group can be executing on either CU of + // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in + // CU mode and all waves of a work-group are on the same CU, and so the + // L0 does not need to be bypassed. + if (!CuMode) Changed |= enableGLCBit(MI); + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to bypass. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory caches + /// to be bypassed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. 
+ + return Changed; +} + +bool SIGfx10CacheControl::enableNonTemporal( + const MachineBasicBlock::iterator &MI) const { + assert(MI->mayLoad() ^ MI->mayStore()); + bool Changed = false; + + Changed |= enableSLCBit(MI); + /// TODO for store (non-rmw atomic) instructions also enableGLCBit(MI) + + return Changed; +} + +bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const { + bool Changed = false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + if (Pos == Position::AFTER) + ++MI; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV)); + Changed = true; + break; + case SIAtomicScope::WORKGROUP: + // In WGP mode the waves of a work-group can be executing on either CU of + // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise + // in CU mode and all waves of a work-group are on the same CU, and so the + // L0 does not need to be invalidated. + if (!CuMode) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); + Changed = true; + } + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to invalidate. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory cache + /// to be invalidated as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. 
+ + if (Pos == Position::AFTER) + --MI; + + return Changed; +} + +bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + SIMemOp Op, + bool IsCrossAddrSpaceOrdering, + Position Pos) const { + bool Changed = false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + if (Pos == Position::AFTER) + ++MI; + + bool VMCnt = false; + bool VSCnt = false; + bool LGKMCnt = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) + VMCnt |= true; + if ((Op & SIMemOp::STORE) != SIMemOp::NONE) + VSCnt |= true; + break; + case SIAtomicScope::WORKGROUP: + // In WGP mode the waves of a work-group can be executing on either CU of + // the WGP. Therefore need to wait for operations to complete to ensure + // they are visible to waves in the other CU as the L0 is per CU. + // Otherwise in CU mode and all waves of a work-group are on the same CU + // which shares the same L0. + if (!CuMode) { + if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) + VMCnt |= true; + if ((Op & SIMemOp::STORE) != SIMemOp::NONE) + VSCnt |= true; + } + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // The L0 cache keeps all memory operations in order for + // work-items in the same wavefront. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + case SIAtomicScope::WORKGROUP: + // If no cross address space ordering then an LDS waitcnt is not + // needed as LDS operations for all waves are executed in a + // total global ordering as observed by all waves. 
Required if + // also synchronizing with global/GDS memory as LDS operations + // could be reordered with respect to later global/GDS memory + // operations of the same wave. + LGKMCnt |= IsCrossAddrSpaceOrdering; + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // The LDS keeps all memory operations in order for + // the same wavefront. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + // If no cross address space ordering then a GDS waitcnt is not + // needed as GDS operations for all waves are executed in a + // total global ordering as observed by all waves. Required if + // also synchronizing with global/LDS memory as GDS operations + // could be reordered with respect to later global/LDS memory + // operations of the same wave. + LGKMCnt |= IsCrossAddrSpaceOrdering; + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // The GDS keeps all memory operations in order for + // the same work-group. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if (VMCnt || LGKMCnt) { + unsigned WaitCntImmediate = + AMDGPU::encodeWaitcnt(IV, + VMCnt ? 0 : getVmcntBitMask(IV), + getExpcntBitMask(IV), + LGKMCnt ? 
0 : getLgkmcntBitMask(IV)); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate); + Changed = true; + } + + if (VSCnt) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(0); + Changed = true; + } + + if (Pos == Position::AFTER) + --MI; + + return Changed; +} + bool SIMemoryLegalizer::removeAtomicPseudoMIs() { if (AtomicPseudoMIs.empty()) return false; Index: test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll +++ test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll @@ -1,141 +1,20 @@ ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s - -; GCN-LABEL: {{^}}system_monotonic_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @system_monotonic_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic monotonic - ret void -} - -; GCN-LABEL: {{^}}system_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @system_acquire_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire monotonic - ret void -} - -; GCN-LABEL: {{^}}system_release_monotonic: 
-; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @system_release_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release monotonic - ret void -} - -; GCN-LABEL: {{^}}system_acq_rel_monotonic: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @system_acq_rel_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel monotonic - ret void -} - -; GCN-LABEL: {{^}}system_seq_cst_monotonic: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @system_seq_cst_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst monotonic - ret void -} - -; GCN-LABEL: {{^}}system_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @system_acquire_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire acquire - ret void -} - -; GCN-LABEL: 
{{^}}system_release_acquire: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @system_release_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release acquire - ret void -} - -; GCN-LABEL: {{^}}system_acq_rel_acquire: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @system_acq_rel_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel acquire - ret void -} - -; GCN-LABEL: {{^}}system_seq_cst_acquire: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @system_seq_cst_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst acquire - ret void -} - -; GCN-LABEL: {{^}}system_seq_cst_seq_cst: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @system_seq_cst_seq_cst( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst - ret void -} +; RUN: llc 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10CU %s ; GCN-LABEL: {{^}}system_one_as_monotonic_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel system_one_as_monotonic_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_monotonic_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -146,9 +25,17 @@ ; GCN-LABEL: {{^}}system_one_as_acquire_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_one_as_acquire_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_acquire_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -159,9 +46,16 @@ ; GCN-LABEL: {{^}}system_one_as_release_monotonic: ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_cmpswap 
v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl._inv +; GFX10: .amdhsa_kernel system_one_as_release_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_release_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -172,9 +66,17 @@ ; GCN-LABEL: {{^}}system_one_as_acq_rel_monotonic: ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_one_as_acq_rel_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_acq_rel_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -185,9 +87,17 @@ ; GCN-LABEL: {{^}}system_one_as_seq_cst_monotonic: ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_one_as_seq_cst_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void 
@system_one_as_seq_cst_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -198,9 +108,17 @@ ; GCN-LABEL: {{^}}system_one_as_acquire_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_one_as_acquire_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_acquire_acquire( i32* %out, i32 %in, i32 %old) { entry: @@ -211,9 +129,17 @@ ; GCN-LABEL: {{^}}system_one_as_release_acquire: ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_one_as_release_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_release_acquire( i32* %out, i32 %in, i32 %old) { entry: @@ -224,9 +150,17 @@ ; GCN-LABEL: {{^}}system_one_as_acq_rel_acquire: ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol 
+; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_one_as_acq_rel_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_acq_rel_acquire( i32* %out, i32 %in, i32 %old) { entry: @@ -237,9 +171,17 @@ ; GCN-LABEL: {{^}}system_one_as_seq_cst_acquire: ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_one_as_seq_cst_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_seq_cst_acquire( i32* %out, i32 %in, i32 %old) { entry: @@ -250,9 +192,17 @@ ; GCN-LABEL: {{^}}system_one_as_seq_cst_seq_cst: ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_one_as_seq_cst_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_seq_cst_seq_cst( i32* %out, i32 %in, i32 %old) { entry: @@ -261,141 +211,18 @@ ret void } -; GCN-LABEL: {{^}}singlethread_monotonic_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; 
GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @singlethread_monotonic_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic - ret void -} - -; GCN-LABEL: {{^}}singlethread_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @singlethread_acquire_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic - ret void -} - -; GCN-LABEL: {{^}}singlethread_release_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @singlethread_release_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic - ret void -} - -; GCN-LABEL: {{^}}singlethread_acq_rel_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @singlethread_acq_rel_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 
%in syncscope("singlethread") acq_rel monotonic - ret void -} - -; GCN-LABEL: {{^}}singlethread_seq_cst_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @singlethread_seq_cst_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic - ret void -} - -; GCN-LABEL: {{^}}singlethread_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @singlethread_acquire_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire - ret void -} - -; GCN-LABEL: {{^}}singlethread_release_acquire: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @singlethread_release_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire - ret void -} - -; GCN-LABEL: {{^}}singlethread_acq_rel_acquire: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void 
@singlethread_acq_rel_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire - ret void -} - -; GCN-LABEL: {{^}}singlethread_seq_cst_acquire: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @singlethread_seq_cst_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire - ret void -} - -; GCN-LABEL: {{^}}singlethread_seq_cst_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @singlethread_seq_cst_seq_cst( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst - ret void -} - ; GCN-LABEL: {{^}}singlethread_one_as_monotonic_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel singlethread_one_as_monotonic_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void 
@singlethread_one_as_monotonic_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -406,9 +233,16 @@ ; GCN-LABEL: {{^}}singlethread_one_as_acquire_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel singlethread_one_as_acquire_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_acquire_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -419,9 +253,16 @@ ; GCN-LABEL: {{^}}singlethread_one_as_release_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GCN-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel singlethread_one_as_release_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_release_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -432,9 +273,16 @@ ; GCN-LABEL: {{^}}singlethread_one_as_acq_rel_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, 
(0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel singlethread_one_as_acq_rel_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_acq_rel_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -445,9 +293,16 @@ ; GCN-LABEL: {{^}}singlethread_one_as_seq_cst_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel singlethread_one_as_seq_cst_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_seq_cst_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -458,9 +313,16 @@ ; GCN-LABEL: {{^}}singlethread_one_as_acquire_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel singlethread_one_as_acquire_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_acquire_acquire( i32* %out, i32 %in, i32 %old) { entry: @@ -471,9 +333,16 @@ ; GCN-LABEL: {{^}}singlethread_one_as_release_acquire: ; GCN-NOT: 
s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel singlethread_one_as_release_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_release_acquire( i32* %out, i32 %in, i32 %old) { entry: @@ -484,9 +353,16 @@ ; GCN-LABEL: {{^}}singlethread_one_as_acq_rel_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel singlethread_one_as_acq_rel_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_acq_rel_acquire( i32* %out, i32 %in, i32 %old) { entry: @@ -497,9 +373,16 @@ ; GCN-LABEL: {{^}}singlethread_one_as_seq_cst_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel singlethread_one_as_seq_cst_acquire +; GFX10WGP-NOT: 
.amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_seq_cst_acquire( i32* %out, i32 %in, i32 %old) { entry: @@ -510,9 +393,16 @@ ; GCN-LABEL: {{^}}singlethread_one_as_seq_cst_seq_cst: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel singlethread_one_as_seq_cst_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_seq_cst_seq_cst( i32* %out, i32 %in, i32 %old) { entry: @@ -521,531 +411,2112 @@ ret void } -; GCN-LABEL: {{^}}agent_monotonic_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-LABEL: {{^}}agent_one_as_monotonic_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @agent_monotonic_monotonic( +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel agent_one_as_monotonic_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_one_as_monotonic_monotonic( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - 
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic ret void } -; GCN-LABEL: {{^}}agent_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-LABEL: {{^}}agent_one_as_acquire_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @agent_acquire_monotonic( +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_one_as_acquire_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_one_as_acquire_monotonic( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic ret void } -; GCN-LABEL: {{^}}agent_release_monotonic: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-LABEL: {{^}}agent_one_as_release_monotonic: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @agent_release_monotonic( +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel agent_one_as_release_monotonic +; 
GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_one_as_release_monotonic( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic ret void } -; GCN-LABEL: {{^}}agent_acq_rel_monotonic: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-LABEL: {{^}}agent_one_as_acq_rel_monotonic: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @agent_acq_rel_monotonic( +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_one_as_acq_rel_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_one_as_acq_rel_monotonic( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic ret void } -; GCN-LABEL: {{^}}agent_seq_cst_monotonic: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-LABEL: {{^}}agent_one_as_seq_cst_monotonic: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: 
s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @agent_seq_cst_monotonic( +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_one_as_seq_cst_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_one_as_seq_cst_monotonic( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic ret void } -; GCN-LABEL: {{^}}agent_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-LABEL: {{^}}agent_one_as_acquire_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_one_as_acquire_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_one_as_acquire_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_release_acquire: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: 
flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_one_as_release_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_one_as_release_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_acq_rel_acquire: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_one_as_acq_rel_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_one_as_acq_rel_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_seq_cst_acquire: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; 
GFX10: .amdhsa_kernel agent_one_as_seq_cst_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_one_as_seq_cst_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_seq_cst_seq_cst: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_one_as_seq_cst_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_one_as_seq_cst_seq_cst( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_monotonic_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel workgroup_one_as_monotonic_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_one_as_monotonic_monotonic( + i32* %out, 
i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_acquire_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: buffer_gl0_inv +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10: .amdhsa_kernel workgroup_one_as_acquire_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_one_as_acquire_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_release_monotonic: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel workgroup_one_as_release_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_one_as_release_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = 
cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_acq_rel_monotonic: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_one_as_acq_rel_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_one_as_acq_rel_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_monotonic: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void 
@workgroup_one_as_seq_cst_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_acquire_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: buffer_gl0_inv +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10: .amdhsa_kernel workgroup_one_as_acquire_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_one_as_acquire_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_release_acquire: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_one_as_release_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void 
@workgroup_one_as_release_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_acq_rel_acquire: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_one_as_acq_rel_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_one_as_acq_rel_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_acquire: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; 
GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_one_as_seq_cst_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_seq_cst: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_one_as_seq_cst_seq_cst( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_monotonic_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel wavefront_one_as_monotonic_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: 
.amdhsa_memory_ordered 0 +define amdgpu_kernel void @wavefront_one_as_monotonic_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_acquire_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel wavefront_one_as_acquire_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @wavefront_one_as_acquire_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_release_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel wavefront_one_as_release_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @wavefront_one_as_release_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in 
syncscope("wavefront-one-as") release monotonic + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_acq_rel_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel wavefront_one_as_acq_rel_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @wavefront_one_as_acq_rel_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_seq_cst_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel wavefront_one_as_seq_cst_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @wavefront_one_as_seq_cst_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_acquire_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap 
v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel wavefront_one_as_acquire_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @wavefront_one_as_acquire_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_release_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel wavefront_one_as_release_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @wavefront_one_as_release_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_acq_rel_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel 
wavefront_one_as_acq_rel_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @wavefront_one_as_acq_rel_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_seq_cst_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel wavefront_one_as_seq_cst_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @wavefront_one_as_seq_cst_acquire( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire + ret void +} + +; GCN-LABEL: {{^}}wavefront_one_as_seq_cst_seq_cst: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel wavefront_one_as_seq_cst_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @wavefront_one_as_seq_cst_seq_cst( + i32* %out, i32 %in, i32 
%old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst + ret void +} + +; GCN-LABEL: {{^}}system_one_as_acquire_monotonic_ret: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_one_as_acquire_monotonic_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_one_as_acquire_monotonic_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}system_one_as_acq_rel_monotonic_ret: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_one_as_acq_rel_monotonic_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_one_as_acq_rel_monotonic_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store 
i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}system_one_as_seq_cst_monotonic_ret: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_one_as_seq_cst_monotonic_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_one_as_seq_cst_monotonic_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}system_one_as_acquire_acquire_ret: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_one_as_acquire_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_one_as_acquire_acquire_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}system_one_as_release_acquire_ret: +; GCN: s_waitcnt vmcnt(0){{$}} +; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_one_as_release_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_one_as_release_acquire_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}system_one_as_acq_rel_acquire_ret: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_one_as_acq_rel_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_one_as_acq_rel_acquire_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}system_one_as_seq_cst_acquire_ret: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} 
glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_one_as_seq_cst_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_one_as_seq_cst_acquire_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}system_one_as_seq_cst_seq_cst_ret: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_one_as_seq_cst_seq_cst_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_one_as_seq_cst_seq_cst_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_acquire_monotonic_ret: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: 
.amdhsa_kernel agent_one_as_acquire_monotonic_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_one_as_acquire_monotonic_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_acq_rel_monotonic_ret: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_one_as_acq_rel_monotonic_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_one_as_acq_rel_monotonic_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_seq_cst_monotonic_ret: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_one_as_seq_cst_monotonic_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: 
.amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_one_as_seq_cst_monotonic_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_acquire_acquire_ret: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_one_as_acquire_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_one_as_acquire_acquire_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_release_acquire_ret: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_one_as_release_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_one_as_release_acquire_ret( + 
i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_acq_rel_acquire_ret: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_one_as_acq_rel_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_one_as_acq_rel_acquire_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_seq_cst_acquire_ret: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_one_as_seq_cst_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_one_as_seq_cst_acquire_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in 
syncscope("agent-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_seq_cst_seq_cst_ret: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_one_as_seq_cst_seq_cst_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_one_as_seq_cst_seq_cst_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_acquire_monotonic_ret: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: buffer_gl0_inv +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10: .amdhsa_kernel workgroup_one_as_acquire_monotonic_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_one_as_acquire_monotonic_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire 
monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_acq_rel_monotonic_ret: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_one_as_acq_rel_monotonic_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_one_as_acq_rel_monotonic_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_monotonic_ret: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst_monotonic_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: 
.amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_one_as_seq_cst_monotonic_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_acquire_acquire_ret: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: buffer_gl0_inv +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10: .amdhsa_kernel workgroup_one_as_acquire_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_one_as_acquire_acquire_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_release_acquire_ret: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol 
+; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_one_as_release_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_one_as_release_acquire_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_acq_rel_acquire_ret: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_one_as_acq_rel_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_one_as_acq_rel_acquire_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_acquire_ret: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: 
s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_one_as_seq_cst_acquire_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_seq_cst_ret: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst_seq_cst_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_one_as_seq_cst_seq_cst_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 
} %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}system_monotonic_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel system_monotonic_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_monotonic_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic monotonic + ret void +} + +; GCN-LABEL: {{^}}system_acquire_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_acquire_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_acquire_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire monotonic + ret void +} + +; GCN-LABEL: {{^}}system_release_monotonic: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], 
v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel system_release_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_release_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release monotonic + ret void +} + +; GCN-LABEL: {{^}}system_acq_rel_monotonic: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_acq_rel_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_acq_rel_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel monotonic + ret void +} + +; GCN-LABEL: {{^}}system_seq_cst_monotonic: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: 
buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_seq_cst_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_seq_cst_monotonic( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst monotonic + ret void +} + +; GCN-LABEL: {{^}}system_acquire_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @agent_acquire_acquire( +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_acquire_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_acquire_acquire( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire acquire ret void } -; GCN-LABEL: {{^}}agent_release_acquire: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-LABEL: {{^}}system_release_acquire: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt 
null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @agent_release_acquire( +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_release_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_release_acquire( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release acquire ret void } -; GCN-LABEL: {{^}}agent_acq_rel_acquire: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-LABEL: {{^}}system_acq_rel_acquire: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @agent_acq_rel_acquire( +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_acq_rel_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_acq_rel_acquire( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel acquire ret void } -; GCN-LABEL: {{^}}agent_seq_cst_acquire: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-LABEL: {{^}}system_seq_cst_acquire: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; 
GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @agent_seq_cst_acquire( +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_seq_cst_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_seq_cst_acquire( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst acquire ret void } -; GCN-LABEL: {{^}}agent_seq_cst_seq_cst: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-LABEL: {{^}}system_seq_cst_seq_cst: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @agent_seq_cst_seq_cst( +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_seq_cst_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_seq_cst_seq_cst( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 
4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst ret void } -; GCN-LABEL: {{^}}agent_one_as_monotonic_monotonic: +; GCN-LABEL: {{^}}singlethread_monotonic_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @agent_one_as_monotonic_monotonic( +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel singlethread_monotonic_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @singlethread_monotonic_monotonic( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic ret void } -; GCN-LABEL: {{^}}agent_one_as_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @agent_one_as_acquire_monotonic( +; GCN-LABEL: {{^}}singlethread_acquire_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: 
buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel singlethread_acquire_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @singlethread_acquire_monotonic( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic ret void } -; GCN-LABEL: {{^}}agent_one_as_release_monotonic: -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @agent_one_as_release_monotonic( +; GCN-LABEL: {{^}}singlethread_release_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GCN-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel singlethread_release_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @singlethread_release_monotonic( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic ret void } -; GCN-LABEL: {{^}}agent_one_as_acq_rel_monotonic: -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{(
offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @agent_one_as_acq_rel_monotonic( +; GCN-LABEL: {{^}}singlethread_acq_rel_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel singlethread_acq_rel_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @singlethread_acq_rel_monotonic( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic ret void } -; GCN-LABEL: {{^}}agent_one_as_seq_cst_monotonic: -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @agent_one_as_seq_cst_monotonic( +; GCN-LABEL: {{^}}singlethread_seq_cst_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel singlethread_seq_cst_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT:
.amdhsa_memory_ordered 0 +define amdgpu_kernel void @singlethread_seq_cst_monotonic( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic ret void } -; GCN-LABEL: {{^}}agent_one_as_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @agent_one_as_acquire_acquire( +; GCN-LABEL: {{^}}singlethread_acquire_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel singlethread_acquire_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @singlethread_acquire_acquire( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire ret void } -; GCN-LABEL: {{^}}agent_one_as_release_acquire: -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @agent_one_as_release_acquire( +; GCN-LABEL: {{^}}singlethread_release_acquire: +;
GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel singlethread_release_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @singlethread_release_acquire( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire ret void } -; GCN-LABEL: {{^}}agent_one_as_acq_rel_acquire: -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @agent_one_as_acq_rel_acquire( +; GCN-LABEL: {{^}}singlethread_acq_rel_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel singlethread_acq_rel_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @singlethread_acq_rel_acquire( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in
syncscope("agent-one-as") acq_rel acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire ret void } -; GCN-LABEL: {{^}}agent_one_as_seq_cst_acquire: -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @agent_one_as_seq_cst_acquire( +; GCN-LABEL: {{^}}singlethread_seq_cst_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel singlethread_seq_cst_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @singlethread_seq_cst_acquire( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire ret void } -; GCN-LABEL: {{^}}agent_one_as_seq_cst_seq_cst: -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -define amdgpu_kernel void @agent_one_as_seq_cst_seq_cst( +; GCN-LABEL: {{^}}singlethread_seq_cst_seq_cst: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt
vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel singlethread_seq_cst_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @singlethread_seq_cst_seq_cst( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst ret void } -; GCN-LABEL: {{^}}workgroup_monotonic_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-LABEL: {{^}}agent_monotonic_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @workgroup_monotonic_monotonic( +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel agent_monotonic_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_monotonic_monotonic( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic ret void } -; GCN-LABEL: {{^}}workgroup_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-LABEL: {{^}}agent_acquire_monotonic: +; GCN-NOT: s_waitcnt
vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @workgroup_acquire_monotonic( +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_acquire_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_acquire_monotonic( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic ret void } -; GCN-LABEL: {{^}}workgroup_release_monotonic: -; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @workgroup_release_monotonic( +; GCN-LABEL: {{^}}agent_release_monotonic: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel agent_release_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define 
amdgpu_kernel void @agent_release_monotonic( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release monotonic ret void } -; GCN-LABEL: {{^}}workgroup_acq_rel_monotonic: -; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @workgroup_acq_rel_monotonic( +; GCN-LABEL: {{^}}agent_acq_rel_monotonic: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_acq_rel_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_acq_rel_monotonic( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic ret void } -; GCN-LABEL: {{^}}workgroup_seq_cst_monotonic: -; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void 
@workgroup_seq_cst_monotonic( +; GCN-LABEL: {{^}}agent_seq_cst_monotonic: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_seq_cst_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_seq_cst_monotonic( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic ret void } -; GCN-LABEL: {{^}}workgroup_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-LABEL: {{^}}agent_acquire_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @workgroup_acquire_acquire( +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_acquire_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_acquire_acquire( i32* %out, i32 %in, 
i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire ret void } -; GCN-LABEL: {{^}}workgroup_release_acquire: -; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @workgroup_release_acquire( +; GCN-LABEL: {{^}}agent_release_acquire: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_release_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_release_acquire( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release acquire ret void } -; GCN-LABEL: {{^}}workgroup_acq_rel_acquire: -; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @workgroup_acq_rel_acquire( +; GCN-LABEL: {{^}}agent_acq_rel_acquire: +; GFX8: s_waitcnt 
vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_acq_rel_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_acq_rel_acquire( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire ret void } -; GCN-LABEL: {{^}}workgroup_seq_cst_acquire: -; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @workgroup_seq_cst_acquire( +; GCN-LABEL: {{^}}agent_seq_cst_acquire: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_seq_cst_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void 
@agent_seq_cst_acquire( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire ret void } -; GCN-LABEL: {{^}}workgroup_seq_cst_seq_cst: -; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @workgroup_seq_cst_seq_cst( +; GCN-LABEL: {{^}}agent_seq_cst_seq_cst: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_seq_cst_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_seq_cst_seq_cst( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst ret void } -; GCN-LABEL: {{^}}workgroup_one_as_monotonic_monotonic: +; GCN-LABEL: {{^}}workgroup_monotonic_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: 
buffer_wbinvl1_vol -define amdgpu_kernel void @workgroup_one_as_monotonic_monotonic( +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GFX10: .amdhsa_kernel workgroup_monotonic_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_monotonic_monotonic( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic ret void } -; GCN-LABEL: {{^}}workgroup_one_as_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @workgroup_one_as_acquire_monotonic( +; GCN-LABEL: {{^}}workgroup_acquire_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: buffer_gl0_inv +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10: .amdhsa_kernel workgroup_acquire_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_acquire_monotonic( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in
syncscope("workgroup-one-as") acquire monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_release_monotonic: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @workgroup_one_as_release_monotonic( +} + +; GCN-LABEL: {{^}}workgroup_release_monotonic: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel workgroup_release_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_release_monotonic( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic ret void } -; GCN-LABEL: {{^}}workgroup_one_as_acq_rel_monotonic: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @workgroup_one_as_acq_rel_monotonic( +; GCN-LABEL: {{^}}workgroup_acq_rel_monotonic: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; 
GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_acq_rel_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_acq_rel_monotonic( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic ret void } -; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_monotonic: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @workgroup_one_as_seq_cst_monotonic( +; GCN-LABEL: {{^}}workgroup_seq_cst_monotonic: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: 
buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_seq_cst_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_seq_cst_monotonic( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic ret void } -; GCN-LABEL: {{^}}workgroup_one_as_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @workgroup_one_as_acquire_acquire( +; GCN-LABEL: {{^}}workgroup_acquire_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: buffer_gl0_inv +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10: .amdhsa_kernel workgroup_acquire_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_acquire_acquire( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire ret void } -; GCN-LABEL: 
{{^}}workgroup_one_as_release_acquire: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @workgroup_one_as_release_acquire( +; GCN-LABEL: {{^}}workgroup_release_acquire: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_release_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_release_acquire( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire ret void } -; GCN-LABEL: {{^}}workgroup_one_as_acq_rel_acquire: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @workgroup_one_as_acq_rel_acquire( +; GCN-LABEL: {{^}}workgroup_acq_rel_acquire: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; 
GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_acq_rel_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_acq_rel_acquire( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire ret void } -; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_acquire: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @workgroup_one_as_seq_cst_acquire( +; GCN-LABEL: {{^}}workgroup_seq_cst_acquire: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel 
workgroup_seq_cst_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_seq_cst_acquire( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire ret void } -; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_seq_cst: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @workgroup_one_as_seq_cst_seq_cst( +; GCN-LABEL: {{^}}workgroup_seq_cst_seq_cst: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_seq_cst_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_seq_cst_seq_cst( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst ret void } 
; GCN-LABEL: {{^}}wavefront_monotonic_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}._inv +; GFX10: .amdhsa_kernel wavefront_monotonic_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_monotonic_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -1055,10 +2526,17 @@ } ; GCN-LABEL: {{^}}wavefront_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}._inv +; GFX10: .amdhsa_kernel wavefront_acquire_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_acquire_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -1068,10 +2546,17 @@ } ; GCN-LABEL: {{^}}wavefront_release_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) 
lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}._inv +; GFX10: .amdhsa_kernel wavefront_release_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_release_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -1081,10 +2566,17 @@ } ; GCN-LABEL: {{^}}wavefront_acq_rel_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}._inv +; GFX10: .amdhsa_kernel wavefront_acq_rel_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_acq_rel_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -1094,10 +2586,17 @@ } ; GCN-LABEL: {{^}}wavefront_seq_cst_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}._inv +; GFX10: .amdhsa_kernel wavefront_seq_cst_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; 
GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_seq_cst_monotonic( i32* %out, i32 %in, i32 %old) { entry: @@ -1107,10 +2606,17 @@ } ; GCN-LABEL: {{^}}wavefront_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}._inv +; GFX10: .amdhsa_kernel wavefront_acquire_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_acquire_acquire( i32* %out, i32 %in, i32 %old) { entry: @@ -1120,10 +2626,17 @@ } ; GCN-LABEL: {{^}}wavefront_release_acquire: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}._inv +; GFX10: .amdhsa_kernel wavefront_release_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_release_acquire( i32* %out, i32 %in, i32 %old) { entry: @@ -1133,10 +2646,17 @@ } ; GCN-LABEL: {{^}}wavefront_acq_rel_acquire: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; 
GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}._inv +; GFX10: .amdhsa_kernel wavefront_acq_rel_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_acq_rel_acquire( i32* %out, i32 %in, i32 %old) { entry: @@ -1146,10 +2666,17 @@ } ; GCN-LABEL: {{^}}wavefront_seq_cst_acquire: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}._inv +; GFX10: .amdhsa_kernel wavefront_seq_cst_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_seq_cst_acquire( i32* %out, i32 %in, i32 %old) { entry: @@ -1159,10 +2686,17 @@ } ; GCN-LABEL: {{^}}wavefront_seq_cst_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; 
GFX8-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}._inv +; GFX10: .amdhsa_kernel wavefront_seq_cst_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_seq_cst_seq_cst( i32* %out, i32 %in, i32 %old) { entry: @@ -1171,132 +2705,588 @@ ret void } -; GCN-LABEL: {{^}}wavefront_one_as_monotonic_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @wavefront_one_as_monotonic_monotonic( +; GCN-LABEL: {{^}}system_acquire_monotonic_ret: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_acquire_monotonic_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_acquire_monotonic_ret( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -; GCN-LABEL: {{^}}wavefront_one_as_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: 
buffer_wbinvl1_vol -define amdgpu_kernel void @wavefront_one_as_acquire_monotonic( +; GCN-LABEL: {{^}}system_acq_rel_monotonic_ret: +; GCN: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_acq_rel_monotonic_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_acq_rel_monotonic_ret( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -; GCN-LABEL: {{^}}wavefront_one_as_release_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @wavefront_one_as_release_monotonic( +; GCN-LABEL: {{^}}system_seq_cst_monotonic_ret: +; GCN: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_seq_cst_monotonic_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: 
.amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_seq_cst_monotonic_ret( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -; GCN-LABEL: {{^}}wavefront_one_as_acq_rel_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @wavefront_one_as_acq_rel_monotonic( +; GCN-LABEL: {{^}}system_acquire_acquire_ret: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_acquire_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_acquire_acquire_ret( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -; GCN-LABEL: {{^}}wavefront_one_as_seq_cst_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], 
v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @wavefront_one_as_seq_cst_monotonic( +; GCN-LABEL: {{^}}system_release_acquire_ret: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_release_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_release_acquire_ret( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -; GCN-LABEL: {{^}}wavefront_one_as_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @wavefront_one_as_acquire_acquire( +; GCN-LABEL: {{^}}system_acq_rel_acquire_ret: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: 
buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_acq_rel_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_acq_rel_acquire_ret( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -; GCN-LABEL: {{^}}wavefront_one_as_release_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @wavefront_one_as_release_acquire( +; GCN-LABEL: {{^}}system_seq_cst_acquire_ret: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_seq_cst_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_seq_cst_acquire_ret( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst acquire + %val0 = extractvalue { 
i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -; GCN-LABEL: {{^}}wavefront_one_as_acq_rel_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @wavefront_one_as_acq_rel_acquire( +; GCN-LABEL: {{^}}system_seq_cst_seq_cst_ret: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_seq_cst_seq_cst_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_seq_cst_seq_cst_ret( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -; GCN-LABEL: {{^}}wavefront_one_as_seq_cst_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @wavefront_one_as_seq_cst_acquire( +; GCN-LABEL: {{^}}agent_acquire_monotonic_ret: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], 
v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_acquire_monotonic_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_acquire_monotonic_ret( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } -; GCN-LABEL: {{^}}wavefront_one_as_seq_cst_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol -define amdgpu_kernel void @wavefront_one_as_seq_cst_seq_cst( +; GCN-LABEL: {{^}}agent_acq_rel_monotonic_ret: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_acq_rel_monotonic_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_acq_rel_monotonic_ret( i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 - 
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_seq_cst_monotonic_ret: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_seq_cst_monotonic_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_seq_cst_monotonic_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_acquire_acquire_ret: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_acquire_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_acquire_acquire_ret( + i32* 
%out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_release_acquire_ret: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_release_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_release_acquire_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_acq_rel_acquire_ret: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_acq_rel_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_acq_rel_acquire_ret( + 
i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_seq_cst_acquire_ret: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_seq_cst_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_seq_cst_acquire_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_seq_cst_seq_cst_ret: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_seq_cst_seq_cst_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_seq_cst_seq_cst_ret( 
+ i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_acquire_monotonic_ret: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: buffer_gl0_inv +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10: .amdhsa_kernel workgroup_acquire_monotonic_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_acquire_monotonic_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_acq_rel_monotonic_ret: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10CU: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_acq_rel_monotonic_ret +; GFX10WGP-NOT: 
.amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_acq_rel_monotonic_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_seq_cst_monotonic_ret: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10CU: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_seq_cst_monotonic_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_seq_cst_monotonic_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_acquire_acquire_ret: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU: 
s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: buffer_gl0_inv +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10: .amdhsa_kernel workgroup_acquire_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_acquire_acquire_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_release_acquire_ret: +; GFX8: s_waitcnt lgkmcnt(0){{$}} +; GFX8: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX8: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10CU: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_release_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_release_acquire_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_acq_rel_acquire_ret: +; GFX8: s_waitcnt lgkmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) 
lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10CU: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_acq_rel_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_acq_rel_acquire_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_seq_cst_acquire_ret: +; GFX8: s_waitcnt lgkmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10CU: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_seq_cst_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_seq_cst_acquire_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire + %val0 = 
extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_seq_cst_seq_cst_ret: +; GFX8: s_waitcnt lgkmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10CU: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_seq_cst_seq_cst_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_seq_cst_seq_cst_ret( + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 ret void } Index: test/CodeGen/AMDGPU/memory-legalizer-atomic-fence.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-atomic-fence.ll +++ test/CodeGen/AMDGPU/memory-legalizer-atomic-fence.ll @@ -1,6 +1,8 @@ ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX6,GFX68 %s ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX8,GFX68 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX8,GFX68 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX10,GFX10WGP %s +; RUN: llc 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX10,GFX10CU %s ; FUNC-LABEL: {{^}}system_one_as_acquire: ; GCN: %bb.0 @@ -9,7 +11,15 @@ ; GFX6-NEXT: buffer_wbinvl1{{$}} ; GFX8: s_waitcnt vmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol{{$}} -; GCN: s_endpgm +; GFX10: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10-NEXT: buffer_gl0_inv{{$}} +; GFX10-NEXT: buffer_gl1_inv{{$}} +; GCN: s_endpgm +; GFX10: .amdhsa_kernel system_one_as_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_acquire() { entry: fence syncscope("one-as") acquire @@ -20,7 +30,12 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN: s_endpgm +; GFX10: .amdhsa_kernel system_one_as_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_release() { entry: fence syncscope("one-as") release @@ -31,9 +46,16 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX6: buffer_wbinvl1{{$}} ; GFX8: buffer_wbinvl1_vol{{$}} +; GFX10-NEXT: buffer_gl0_inv{{$}} +; GFX10-NEXT: buffer_gl1_inv{{$}} ; GCN: s_endpgm +; GFX10: .amdhsa_kernel system_one_as_acq_rel +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_acq_rel() { entry: fence syncscope("one-as") acq_rel @@ -44,9 +66,16 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX6: buffer_wbinvl1{{$}} ; GFX8: buffer_wbinvl1_vol{{$}} +; GFX10-NEXT: 
buffer_gl0_inv{{$}} +; GFX10-NEXT: buffer_gl1_inv{{$}} ; GCN: s_endpgm +; GFX10: .amdhsa_kernel system_one_as_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_seq_cst() { entry: fence syncscope("one-as") seq_cst @@ -57,6 +86,10 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm +; GFX10: .amdhsa_kernel singlethread_one_as_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_acquire() { entry: fence syncscope("singlethread-one-as") acquire @@ -67,6 +100,10 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm +; GFX10: .amdhsa_kernel singlethread_one_as_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_release() { entry: fence syncscope("singlethread-one-as") release @@ -77,6 +114,10 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm +; GFX10: .amdhsa_kernel singlethread_one_as_acq_rel +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_acq_rel() { entry: fence syncscope("singlethread-one-as") acq_rel @@ -87,6 +128,10 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm +; GFX10: .amdhsa_kernel singlethread_one_as_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_seq_cst() { entry: fence syncscope("singlethread-one-as") seq_cst @@ -100,7 +145,15 @@ ; GFX6-NEXT: buffer_wbinvl1{{$}} ; GFX8: s_waitcnt vmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol{{$}} -; GCN: s_endpgm +; 
GFX10: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10-NEXT: buffer_gl0_inv{{$}} +; GFX10-NEXT: buffer_gl1_inv{{$}} +; GCN: s_endpgm +; GFX10: .amdhsa_kernel agent_one_as_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_one_as_acquire() { entry: fence syncscope("agent-one-as") acquire @@ -111,7 +164,12 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN: s_endpgm +; GFX10: .amdhsa_kernel agent_one_as_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_one_as_release() { entry: fence syncscope("agent-one-as") release @@ -122,9 +180,16 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX6: buffer_wbinvl1{{$}} ; GFX8: buffer_wbinvl1_vol{{$}} +; GFX10-NEXT: buffer_gl0_inv{{$}} +; GFX10-NEXT: buffer_gl1_inv{{$}} ; GCN: s_endpgm +; GFX10: .amdhsa_kernel agent_one_as_acq_rel +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_one_as_acq_rel() { entry: fence syncscope("agent-one-as") acq_rel @@ -135,53 +200,99 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX6: buffer_wbinvl1{{$}} ; GFX8: buffer_wbinvl1_vol{{$}} +; GFX10-NEXT: buffer_gl0_inv{{$}} +; GFX10-NEXT: buffer_gl1_inv{{$}} ; GCN: s_endpgm +; GFX10: .amdhsa_kernel agent_one_as_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_one_as_seq_cst() { entry: fence 
syncscope("agent-one-as") seq_cst ret void } -; FUNC-LABEL: {{^}}workgroup_one_as_acquire: -; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm +; FUNC-LABEL: {{^}}workgroup_one_as_acquire: +; GCN: %bb.0 +; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10WGP-NEXT: buffer_gl0_inv{{$}} +; GFX10CU-NOT: buffer_gl0_inv{{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +; GFX10: .amdhsa_kernel workgroup_one_as_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_one_as_acquire() { entry: fence syncscope("workgroup-one-as") acquire ret void } -; FUNC-LABEL: {{^}}workgroup_one_as_release: -; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm +; FUNC-LABEL: {{^}}workgroup_one_as_release: +; GCN: %bb.0 +; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10-NOT: buffer_gl0_inv +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +; GFX10: .amdhsa_kernel workgroup_one_as_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_one_as_release() { entry: fence syncscope("workgroup-one-as") release ret void } -; FUNC-LABEL: {{^}}workgroup_one_as_acq_rel: -; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm +; FUNC-LABEL: {{^}}workgroup_one_as_acq_rel: +; GCN: %bb.0 +; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; 
GFX10WGP-NEXT: buffer_gl0_inv{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: buffer_gl0_inv{{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +; GFX10: .amdhsa_kernel workgroup_one_as_acq_rel +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_one_as_acq_rel() { entry: fence syncscope("workgroup-one-as") acq_rel ret void } -; FUNC-LABEL: {{^}}workgroup_one_as_seq_cst: -; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm +; FUNC-LABEL: {{^}}workgroup_one_as_seq_cst: +; GCN: %bb.0 +; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10WGP-NEXT: buffer_gl0_inv{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: buffer_gl0_inv{{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_one_as_seq_cst() { entry: fence syncscope("workgroup-one-as") seq_cst @@ -192,6 +303,10 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm +; GFX10: .amdhsa_kernel wavefront_one_as_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_one_as_acquire() { entry: fence syncscope("wavefront-one-as") acquire @@ -202,6 +317,10 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm +; GFX10: .amdhsa_kernel wavefront_one_as_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void 
@wavefront_one_as_release() { entry: fence syncscope("wavefront-one-as") release @@ -212,6 +331,10 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm +; GFX10: .amdhsa_kernel wavefront_one_as_acq_rel +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_one_as_acq_rel() { entry: fence syncscope("wavefront-one-as") acq_rel @@ -222,6 +345,10 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm +; GFX10: .amdhsa_kernel wavefront_one_as_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_one_as_seq_cst() { entry: fence syncscope("wavefront-one-as") seq_cst @@ -235,7 +362,15 @@ ; GFX6-NEXT: buffer_wbinvl1{{$}} ; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol{{$}} -; GCN: s_endpgm +; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10-NEXT: buffer_gl0_inv{{$}} +; GFX10-NEXT: buffer_gl1_inv{{$}} +; GCN: s_endpgm +; GFX10: .amdhsa_kernel system_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_acquire() { entry: fence acquire @@ -245,8 +380,15 @@ ; FUNC-LABEL: {{^}}system_release: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN: s_endpgm +; GFX10: .amdhsa_kernel system_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_release() { entry: fence release @@ -256,10 +398,19 @@ ; 
FUNC-LABEL: {{^}}system_acq_rel: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX6: buffer_wbinvl1{{$}} ; GFX8: buffer_wbinvl1_vol{{$}} +; GFX10-NEXT: buffer_gl0_inv{{$}} +; GFX10-NEXT: buffer_gl1_inv{{$}} ; GCN: s_endpgm +; GFX10: .amdhsa_kernel system_acq_rel +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_acq_rel() { entry: fence acq_rel @@ -269,10 +420,19 @@ ; FUNC-LABEL: {{^}}system_seq_cst: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX6: buffer_wbinvl1{{$}} ; GFX8: buffer_wbinvl1_vol{{$}} +; GFX10-NEXT: buffer_gl0_inv{{$}} +; GFX10-NEXT: buffer_gl1_inv{{$}} ; GCN: s_endpgm +; GFX10: .amdhsa_kernel system_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_seq_cst() { entry: fence seq_cst @@ -283,6 +443,10 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm +; GFX10: .amdhsa_kernel singlethread_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_acquire() { entry: fence syncscope("singlethread") acquire @@ -293,6 +457,10 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm +; GFX10: .amdhsa_kernel singlethread_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: 
.amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_release() { entry: fence syncscope("singlethread") release @@ -303,6 +471,10 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm +; GFX10: .amdhsa_kernel singlethread_acq_rel +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_acq_rel() { entry: fence syncscope("singlethread") acq_rel @@ -313,6 +485,10 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm +; GFX10: .amdhsa_kernel singlethread_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_seq_cst() { entry: fence syncscope("singlethread") seq_cst @@ -326,7 +502,15 @@ ; GFX6-NEXT: buffer_wbinvl1{{$}} ; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NEXT: buffer_wbinvl1_vol{{$}} -; GCN: s_endpgm +; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10-NEXT: buffer_gl0_inv{{$}} +; GFX10-NEXT: buffer_gl1_inv{{$}} +; GCN: s_endpgm +; GFX10: .amdhsa_kernel agent_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_acquire() { entry: fence syncscope("agent") acquire @@ -336,8 +520,15 @@ ; FUNC-LABEL: {{^}}agent_release: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN: s_endpgm +; GFX10: .amdhsa_kernel agent_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_release() { entry: fence 
syncscope("agent") release @@ -347,10 +538,19 @@ ; FUNC-LABEL: {{^}}agent_acq_rel: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX6: buffer_wbinvl1{{$}} ; GFX8: buffer_wbinvl1_vol{{$}} +; GFX10-NEXT: buffer_gl0_inv{{$}} +; GFX10-NEXT: buffer_gl1_inv{{$}} ; GCN: s_endpgm +; GFX10: .amdhsa_kernel agent_acq_rel +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_acq_rel() { entry: fence syncscope("agent") acq_rel @@ -360,54 +560,102 @@ ; FUNC-LABEL: {{^}}agent_seq_cst: ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX6: buffer_wbinvl1{{$}} ; GFX8: buffer_wbinvl1_vol{{$}} +; GFX10-NEXT: buffer_gl0_inv{{$}} +; GFX10-NEXT: buffer_gl1_inv{{$}} ; GCN: s_endpgm +; GFX10: .amdhsa_kernel agent_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_seq_cst() { entry: fence syncscope("agent") seq_cst ret void } -; FUNC-LABEL: {{^}}workgroup_acquire: -; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm +; FUNC-LABEL: {{^}}workgroup_acquire: +; GCN: %bb.0 +; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10WGP-NEXT: buffer_gl0_inv{{$}} +; GFX10CU-NOT: buffer_gl0_inv{{$}} +; GCN-NOT: 
ATOMIC_FENCE +; GCN: s_endpgm +; GFX10: .amdhsa_kernel workgroup_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_acquire() { entry: fence syncscope("workgroup") acquire ret void } -; FUNC-LABEL: {{^}}workgroup_release: -; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm +; FUNC-LABEL: {{^}}workgroup_release: +; GCN: %bb.0 +; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10-NOT: buffer_gl0_inv +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +; GFX10: .amdhsa_kernel workgroup_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_release() { entry: fence syncscope("workgroup") release ret void } -; FUNC-LABEL: {{^}}workgroup_acq_rel: -; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm +; FUNC-LABEL: {{^}}workgroup_acq_rel: +; GCN: %bb.0 +; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10WGP-NEXT: buffer_gl0_inv{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: buffer_gl0_inv{{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +; GFX10: .amdhsa_kernel workgroup_acq_rel +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_acq_rel() { entry: fence syncscope("workgroup") acq_rel ret void } -; FUNC-LABEL: {{^}}workgroup_seq_cst: -; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0) 
lgkmcnt(0){{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm +; FUNC-LABEL: {{^}}workgroup_seq_cst: +; GCN: %bb.0 +; GFX68-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10WGP-NEXT: buffer_gl0_inv{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: buffer_gl0_inv{{$}} +; GCN-NOT: ATOMIC_FENCE +; GCN: s_endpgm +; GFX10: .amdhsa_kernel workgroup_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_seq_cst() { entry: fence syncscope("workgroup") seq_cst @@ -418,6 +666,10 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm +; GFX10: .amdhsa_kernel wavefront_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_acquire() { entry: fence syncscope("wavefront") acquire @@ -428,6 +680,10 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm +; GFX10: .amdhsa_kernel wavefront_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_release() { entry: fence syncscope("wavefront") release @@ -438,6 +694,10 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm +; GFX10: .amdhsa_kernel wavefront_acq_rel +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_acq_rel() { entry: fence syncscope("wavefront") acq_rel @@ -448,6 +708,10 @@ ; GCN: %bb.0 ; GCN-NOT: ATOMIC_FENCE ; GCN: s_endpgm +; GFX10: .amdhsa_kernel wavefront_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: 
.amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_seq_cst() { entry: fence syncscope("wavefront") seq_cst Index: test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll +++ test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll @@ -1,11 +1,19 @@ ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10CU %s ; GCN-LABEL: {{^}}system_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel system_one_as_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_monotonic( i32* %out, i32 %in) { entry: @@ -15,9 +23,17 @@ ; GCN-LABEL: {{^}}system_one_as_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel 
system_one_as_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_acquire( i32* %out, i32 %in) { entry: @@ -27,9 +43,15 @@ ; GCN-LABEL: {{^}}system_one_as_release: ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel system_one_as_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_release( i32* %out, i32 %in) { entry: @@ -39,9 +61,17 @@ ; GCN-LABEL: {{^}}system_one_as_acq_rel: ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_one_as_acq_rel +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_acq_rel( i32* %out, i32 %in) { entry: @@ -51,9 +81,17 @@ ; GCN-LABEL: {{^}}system_one_as_seq_cst: ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: 
.amdhsa_kernel system_one_as_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_seq_cst( i32* %out, i32 %in) { entry: @@ -63,9 +101,15 @@ ; GCN-LABEL: {{^}}singlethread_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel singlethread_one_as_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_monotonic( i32* %out, i32 %in) { entry: @@ -75,9 +119,15 @@ ; GCN-LABEL: {{^}}singlethread_one_as_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel singlethread_one_as_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_acquire( i32* %out, i32 %in) { entry: @@ -87,9 +137,15 @@ ; GCN-LABEL: {{^}}singlethread_one_as_release: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel 
singlethread_one_as_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_release( i32* %out, i32 %in) { entry: @@ -99,9 +155,15 @@ ; GCN-LABEL: {{^}}singlethread_one_as_acq_rel: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel singlethread_one_as_acq_rel +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_acq_rel( i32* %out, i32 %in) { entry: @@ -111,9 +173,15 @@ ; GCN-LABEL: {{^}}singlethread_one_as_seq_cst: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel singlethread_one_as_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_seq_cst( i32* %out, i32 %in) { entry: @@ -123,9 +191,15 @@ ; GCN-LABEL: {{^}}agent_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel 
agent_one_as_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_one_as_monotonic( i32* %out, i32 %in) { entry: @@ -135,9 +209,17 @@ ; GCN-LABEL: {{^}}agent_one_as_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_one_as_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_one_as_acquire( i32* %out, i32 %in) { entry: @@ -147,9 +229,15 @@ ; GCN-LABEL: {{^}}agent_one_as_release: ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel agent_one_as_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_one_as_release( i32* %out, i32 %in) { entry: @@ -159,9 +247,17 @@ ; GCN-LABEL: {{^}}agent_one_as_acq_rel: ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; 
GFX10: .amdhsa_kernel agent_one_as_acq_rel +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_one_as_acq_rel( i32* %out, i32 %in) { entry: @@ -171,9 +267,17 @@ ; GCN-LABEL: {{^}}agent_one_as_seq_cst: ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_one_as_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_one_as_seq_cst( i32* %out, i32 %in) { entry: @@ -183,9 +287,15 @@ ; GCN-LABEL: {{^}}workgroup_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel workgroup_one_as_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_one_as_monotonic( i32* %out, i32 %in) { entry: @@ -193,11 +303,20 @@ ret void } -; GCN-LABEL: {{^}}workgroup_one_as_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol +; GCN-LABEL: {{^}}workgroup_one_as_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: 
flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_one_as_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_one_as_acquire( i32* %out, i32 %in) { entry: @@ -205,11 +324,20 @@ ret void } -; GCN-LABEL: {{^}}workgroup_one_as_release: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-LABEL: {{^}}workgroup_one_as_release: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel workgroup_one_as_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_one_as_release( i32* %out, i32 %in) { entry: @@ -217,11 +345,23 @@ ret void } -; GCN-LABEL: {{^}}workgroup_one_as_acq_rel: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol +; GCN-LABEL: {{^}}workgroup_one_as_acq_rel: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; 
GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_one_as_acq_rel +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_one_as_acq_rel( i32* %out, i32 %in) { entry: @@ -229,11 +369,23 @@ ret void } -; GCN-LABEL: {{^}}workgroup_one_as_seq_cst: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_one_as_seq_cst( i32* %out, i32 %in) { entry: @@ -243,9 +395,15 @@ ; GCN-LABEL: {{^}}wavefront_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: 
s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel wavefront_one_as_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_one_as_monotonic( i32* %out, i32 %in) { entry: @@ -255,9 +413,15 @@ ; GCN-LABEL: {{^}}wavefront_one_as_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel wavefront_one_as_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_one_as_acquire( i32* %out, i32 %in) { entry: @@ -267,9 +431,15 @@ ; GCN-LABEL: {{^}}wavefront_one_as_release: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel wavefront_one_as_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_one_as_release( i32* %out, i32 %in) { entry: @@ -279,9 +449,15 @@ ; GCN-LABEL: {{^}}wavefront_one_as_acq_rel: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, 
(0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel wavefront_one_as_acq_rel +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_one_as_acq_rel( i32* %out, i32 %in) { entry: @@ -291,9 +467,15 @@ ; GCN-LABEL: {{^}}wavefront_one_as_seq_cst: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel wavefront_one_as_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_one_as_seq_cst( i32* %out, i32 %in) { entry: @@ -301,11 +483,209 @@ ret void } +; GCN-LABEL: {{^}}system_one_as_acquire_ret: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_one_as_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_one_as_acquire_ret( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire + store i32 %val, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}system_one_as_acq_rel_ret: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN-NEXT: 
s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_one_as_acq_rel_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_one_as_acq_rel_ret( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel + store i32 %val, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}system_one_as_seq_cst_ret: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_one_as_seq_cst_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_one_as_seq_cst_ret( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst + store i32 %val, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_acquire_ret: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_one_as_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_one_as_acquire_ret( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire + store 
i32 %val, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_acq_rel_ret: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_one_as_acq_rel_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_one_as_acq_rel_ret( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel + store i32 %val, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_one_as_seq_cst_ret: +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_one_as_seq_cst_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_one_as_seq_cst_ret( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst + store i32 %val, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_acquire_ret: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; 
GFX10: .amdhsa_kernel workgroup_one_as_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_one_as_acquire_ret( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire + store i32 %val, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_acq_rel_ret: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_one_as_acq_rel_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_one_as_acq_rel_ret( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel + store i32 %val, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_ret: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel 
workgroup_one_as_seq_cst_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_one_as_seq_cst_ret( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst + store i32 %val, i32* %out, align 4 + ret void +} + ; GCN-LABEL: {{^}}system_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel system_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_monotonic( i32* %out, i32 %in) { entry: @@ -314,10 +694,19 @@ } ; GCN-LABEL: {{^}}system_acquire: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_acquire( i32* %out, i32 %in) { entry: @@ -326,10 +715,17 @@ } ; GCN-LABEL: {{^}}system_release: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; 
GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel system_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_release( i32* %out, i32 %in) { entry: @@ -338,10 +734,20 @@ } ; GCN-LABEL: {{^}}system_acq_rel: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_acq_rel +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_acq_rel( i32* %out, i32 %in) { entry: @@ -350,10 +756,20 @@ } ; GCN-LABEL: {{^}}system_seq_cst: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: 
buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_seq_cst( i32* %out, i32 %in) { entry: @@ -362,10 +778,16 @@ } ; GCN-LABEL: {{^}}singlethread_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel singlethread_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_monotonic( i32* %out, i32 %in) { entry: @@ -374,10 +796,16 @@ } ; GCN-LABEL: {{^}}singlethread_acquire: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel singlethread_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_acquire( i32* %out, i32 %in) { entry: @@ -386,10 +814,16 @@ } ; GCN-LABEL: {{^}}singlethread_release: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: 
flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel singlethread_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_release( i32* %out, i32 %in) { entry: @@ -398,10 +832,16 @@ } ; GCN-LABEL: {{^}}singlethread_acq_rel: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel singlethread_acq_rel +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_acq_rel( i32* %out, i32 %in) { entry: @@ -410,10 +850,16 @@ } ; GCN-LABEL: {{^}}singlethread_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel singlethread_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_seq_cst( i32* %out, 
i32 %in) { entry: @@ -422,10 +868,16 @@ } ; GCN-LABEL: {{^}}agent_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel agent_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_monotonic( i32* %out, i32 %in) { entry: @@ -434,10 +886,19 @@ } ; GCN-LABEL: {{^}}agent_acquire: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_acquire( i32* %out, i32 %in) { entry: @@ -446,10 +907,17 @@ } ; GCN-LABEL: {{^}}agent_release: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; 
GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel agent_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_release( i32* %out, i32 %in) { entry: @@ -458,10 +926,20 @@ } ; GCN-LABEL: {{^}}agent_acq_rel: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_acq_rel +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_acq_rel( i32* %out, i32 %in) { entry: @@ -470,10 +948,20 @@ } ; GCN-LABEL: {{^}}agent_seq_cst: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_seq_cst( i32* %out, i32 %in) { entry: @@ -482,10 +970,16 @@ } ; GCN-LABEL: {{^}}workgroup_monotonic: -; GCN-NOT: 
s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel workgroup_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_monotonic( i32* %out, i32 %in) { entry: @@ -493,11 +987,20 @@ ret void } -; GCN-LABEL: {{^}}workgroup_acquire: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol +; GCN-LABEL: {{^}}workgroup_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_acquire( i32* %out, i32 %in) { entry: @@ -505,11 +1008,20 @@ ret void } -; GCN-LABEL: {{^}}workgroup_release: -; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-LABEL: {{^}}workgroup_release: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; 
GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel workgroup_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_release( i32* %out, i32 %in) { entry: @@ -517,11 +1029,23 @@ ret void } -; GCN-LABEL: {{^}}workgroup_acq_rel: -; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol +; GCN-LABEL: {{^}}workgroup_acq_rel: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_acq_rel +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_acq_rel( i32* %out, i32 %in) { entry: @@ -529,11 +1053,23 @@ ret void } -; GCN-LABEL: {{^}}workgroup_seq_cst: -; GFX8-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol +; GCN-LABEL: {{^}}workgroup_seq_cst: +; GFX8-NOT: s_waitcnt 
vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_seq_cst( i32* %out, i32 %in) { entry: @@ -542,10 +1078,16 @@ } ; GCN-LABEL: {{^}}wavefront_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel wavefront_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_monotonic( i32* %out, i32 %in) { entry: @@ -554,10 +1096,16 @@ } ; GCN-LABEL: {{^}}wavefront_acquire: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel 
wavefront_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_acquire( i32* %out, i32 %in) { entry: @@ -566,10 +1114,16 @@ } ; GCN-LABEL: {{^}}wavefront_release: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel wavefront_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_release( i32* %out, i32 %in) { entry: @@ -578,10 +1132,16 @@ } ; GCN-LABEL: {{^}}wavefront_acq_rel: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel wavefront_acq_rel +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_acq_rel( i32* %out, i32 %in) { entry: @@ -590,13 +1150,221 @@ } ; GCN-LABEL: {{^}}wavefront_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0) 
lgkmcnt(0){{$}} -; GCN-NOT: buffer_wbinvl1_vol +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} +; GFX10: .amdhsa_kernel wavefront_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_seq_cst( i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst ret void } + +; GCN-LABEL: {{^}}system_acquire_ret: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_acquire_ret( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in acquire + store i32 %val, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}system_acq_rel_ret: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_acq_rel_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_acq_rel_ret( + i32* %out, i32 
%in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in acq_rel + store i32 %val, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}system_seq_cst_ret: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel system_seq_cst_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @system_seq_cst_ret( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst + store i32 %val, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_acquire_ret: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_acquire_ret( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acquire + store i32 %val, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_acq_rel_ret: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} +; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_acq_rel_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_acq_rel_ret( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acq_rel + store i32 %val, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}agent_seq_cst_ret: +; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GCN-NEXT: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10: .amdhsa_kernel agent_seq_cst_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @agent_seq_cst_ret( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") seq_cst + store i32 %val, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_acquire_ret: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GCN: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_acquire_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: 
.amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_acquire_ret( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acquire + store i32 %val, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_acq_rel_ret: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_acq_rel_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_acq_rel_ret( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acq_rel + store i32 %val, i32* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}workgroup_seq_cst_ret: +; GFX8-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX8-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GFX10: .amdhsa_kernel workgroup_seq_cst_ret +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; 
GFX10-NOT: .amdhsa_memory_ordered 0 +define amdgpu_kernel void @workgroup_seq_cst_ret( + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") seq_cst + store i32 %val, i32* %out, align 4 + ret void +} Index: test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll +++ test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll @@ -1,5 +1,6 @@ ; RUN: not llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s 2>&1 | FileCheck %s ; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s 2>&1 | FileCheck %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s 2>&1 | FileCheck %s ; CHECK: error: <unknown>:0:0: in function invalid_fence void (): Unsupported atomic synchronization scope define amdgpu_kernel void @invalid_fence() { Index: test/CodeGen/AMDGPU/memory-legalizer-load.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-load.ll +++ test/CodeGen/AMDGPU/memory-legalizer-load.ll @@ -2,15 +2,24 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck 
--check-prefixes=GCN,GFX10,GFX10CU %s declare i32 @llvm.amdgcn.workitem.id.x() ; GCN-LABEL: {{^}}system_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel system_one_as_unordered +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_unordered( i32* %in, i32* %out) { entry: @@ -21,10 +30,18 @@ ; GCN-LABEL: {{^}}system_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel system_one_as_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_monotonic( i32* %in, i32* %out) { entry: @@ -35,10 +52,18 @@ ; GCN-LABEL: {{^}}system_one_as_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} ; GCN-NEXT: s_waitcnt vmcnt(0){{$}} ; GFX89-NEXT: 
buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel system_one_as_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_acquire( i32* %in, i32* %out) { entry: @@ -49,10 +74,18 @@ ; GCN-LABEL: {{^}}system_one_as_seq_cst: ; GCN: s_waitcnt vmcnt(0){{$}} -; GCN-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX89-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GFX10-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} ; GCN-NEXT: s_waitcnt vmcnt(0){{$}} ; GFX89-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel system_one_as_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_seq_cst( i32* %in, i32* %out) { entry: @@ -63,10 +96,17 @@ ; GCN-LABEL: {{^}}singlethread_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel singlethread_one_as_unordered +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_unordered( i32* %in, i32* %out) { entry: @@ -77,10 +117,17 @@ ; GCN-LABEL: 
{{^}}singlethread_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel singlethread_one_as_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_monotonic( i32* %in, i32* %out) { entry: @@ -91,10 +138,17 @@ ; GCN-LABEL: {{^}}singlethread_one_as_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel singlethread_one_as_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_acquire( i32* %in, i32* %out) { entry: @@ -105,10 +159,17 @@ ; GCN-LABEL: {{^}}singlethread_one_as_seq_cst: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel singlethread_one_as_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: 
.amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_seq_cst( i32* %in, i32* %out) { entry: @@ -119,10 +180,17 @@ ; GCN-LABEL: {{^}}agent_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel agent_one_as_unordered +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_one_as_unordered( i32* %in, i32* %out) { entry: @@ -133,10 +201,18 @@ ; GCN-LABEL: {{^}}agent_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel agent_one_as_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_one_as_monotonic( i32* %in, i32* %out) { entry: @@ -147,10 +223,18 @@ ; GCN-LABEL: {{^}}agent_one_as_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GFX10: flat_load_dword 
[[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} ; GCN-NEXT: s_waitcnt vmcnt(0){{$}} ; GFX89-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel agent_one_as_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_one_as_acquire( i32* %in, i32* %out) { entry: @@ -161,10 +245,18 @@ ; GCN-LABEL: {{^}}agent_one_as_seq_cst: ; GCN: s_waitcnt vmcnt(0){{$}} -; GCN-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX89-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GFX10-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} ; GCN-NEXT: s_waitcnt vmcnt(0){{$}} ; GFX89-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel agent_one_as_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_one_as_seq_cst( i32* %in, i32* %out) { entry: @@ -175,10 +267,17 @@ ; GCN-LABEL: {{^}}workgroup_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel workgroup_one_as_unordered +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void 
@workgroup_one_as_unordered( i32* %in, i32* %out) { entry: @@ -187,12 +286,21 @@ ret void } -; GCN-LABEL: {{^}}workgroup_one_as_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GCN-LABEL: {{^}}workgroup_one_as_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GFX10WGP: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GFX10CU: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel workgroup_one_as_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_one_as_monotonic( i32* %in, i32* %out) { entry: @@ -201,12 +309,23 @@ ret void } -; GCN-LABEL: {{^}}workgroup_one_as_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GFX89-NOT: s_waitcnt vmcnt(0){{$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GCN-LABEL: {{^}}workgroup_one_as_acquire: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GFX10WGP: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GFX10CU: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GFX89-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt
vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel workgroup_one_as_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_one_as_acquire( i32* %in, i32* %out) { entry: @@ -215,12 +334,26 @@ ret void } -; GCN-LABEL: {{^}}workgroup_one_as_seq_cst: -; GFX89-NOT: s_waitcnt vmcnt(0){{$}} -; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GFX89-NOT: s_waitcnt vmcnt(0){{$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst: +; GFX89-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0 +; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GFX10WGP: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GFX10CU: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GFX89-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_one_as_seq_cst( i32* %in, i32* %out) { entry: @@ -231,10 +364,17 @@ ; GCN-LABEL: {{^}}wavefront_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_load_dword 
[[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel wavefront_one_as_unordered +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_one_as_unordered( i32* %in, i32* %out) { entry: @@ -245,10 +385,17 @@ ; GCN-LABEL: {{^}}wavefront_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel wavefront_one_as_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_one_as_monotonic( i32* %in, i32* %out) { entry: @@ -259,10 +406,17 @@ ; GCN-LABEL: {{^}}wavefront_one_as_acquire: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel wavefront_one_as_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_one_as_acquire( i32* %in, i32* %out) { entry: @@ -273,10 +427,17 @@ ; 
GCN-LABEL: {{^}}wavefront_one_as_seq_cst: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel wavefront_one_as_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_one_as_seq_cst( i32* %in, i32* %out) { entry: @@ -287,6 +448,11 @@ ; GCN-LABEL: {{^}}nontemporal_private_0: ; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}} +; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}} +; GFX10: .amdhsa_kernel nontemporal_private_0 +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @nontemporal_private_0( i32 addrspace(5)* %in, i32* %out) { entry: @@ -297,6 +463,11 @@ ; GCN-LABEL: {{^}}nontemporal_private_1: ; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}} +; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}} +; GFX10: .amdhsa_kernel nontemporal_private_1 +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @nontemporal_private_1( i32 addrspace(5)* %in, i32* %out) { entry: @@ -309,6 +480,10 @@ ; GCN-LABEL: {{^}}nontemporal_global_0: ; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0x0{{$}} +; GFX10: .amdhsa_kernel nontemporal_global_0 +; GFX10WGP-NOT: 
.amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @nontemporal_global_0( i32 addrspace(1)* %in, i32* %out) { entry: @@ -320,6 +495,11 @@ ; GCN-LABEL: {{^}}nontemporal_global_1: ; GFX8: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} ; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} +; GFX10: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] slc{{$}} +; GFX10: .amdhsa_kernel nontemporal_global_1 +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @nontemporal_global_1( i32 addrspace(1)* %in, i32* %out) { entry: @@ -332,6 +512,10 @@ ; GCN-LABEL: {{^}}nontemporal_local_0: ; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel nontemporal_local_0 +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @nontemporal_local_0( i32 addrspace(3)* %in, i32* %out) { entry: @@ -342,6 +526,10 @@ ; GCN-LABEL: {{^}}nontemporal_local_1: ; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel nontemporal_local_1 +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @nontemporal_local_1( i32 addrspace(3)* %in, i32* %out) { entry: @@ -354,6 +542,11 @@ ; GCN-LABEL: {{^}}nontemporal_flat_0: ; GFX89: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} +; GFX10: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] slc{{$}} +; GFX10: .amdhsa_kernel nontemporal_flat_0 +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define 
amdgpu_kernel void @nontemporal_flat_0( i32* %in, i32* %out) { entry: @@ -364,6 +557,11 @@ ; GCN-LABEL: {{^}}nontemporal_flat_1: ; GFX89: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} +; GFX10: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] slc{{$}} +; GFX10: .amdhsa_kernel nontemporal_flat_1 +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @nontemporal_flat_1( i32* %in, i32* %out) { entry: @@ -375,11 +573,18 @@ } ; GCN-LABEL: {{^}}system_unordered: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel system_unordered +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_unordered( i32* %in, i32* %out) { entry: @@ -389,11 +594,19 @@ } ; GCN-LABEL: {{^}}system_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel system_monotonic +; GFX10WGP-NOT: 
.amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_monotonic( i32* %in, i32* %out) { entry: @@ -403,11 +616,20 @@ } ; GCN-LABEL: {{^}}system_acquire: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} +; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX89-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel system_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_acquire( i32* %in, i32* %out) { entry: @@ -417,11 +639,21 @@ } ; GCN-LABEL: {{^}}system_seq_cst: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GFX89-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GFX10-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} +; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX89-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel system_seq_cst +; GFX10WGP-NOT: 
.amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_seq_cst( i32* %in, i32* %out) { entry: @@ -431,11 +663,18 @@ } ; GCN-LABEL: {{^}}singlethread_unordered: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel singlethread_unordered +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_unordered( i32* %in, i32* %out) { entry: @@ -445,11 +684,18 @@ } ; GCN-LABEL: {{^}}singlethread_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel singlethread_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_monotonic( i32* %in, i32* %out) { entry: @@ -459,11 +705,18 @@ } ; GCN-LABEL: {{^}}singlethread_acquire: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} 
; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel singlethread_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_acquire( i32* %in, i32* %out) { entry: @@ -473,11 +726,18 @@ } ; GCN-LABEL: {{^}}singlethread_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel singlethread_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_seq_cst( i32* %in, i32* %out) { entry: @@ -487,11 +747,18 @@ } ; GCN-LABEL: {{^}}agent_unordered: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel agent_unordered +; GFX10WGP-NOT: 
.amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_unordered( i32* %in, i32* %out) { entry: @@ -501,11 +768,19 @@ } ; GCN-LABEL: {{^}}agent_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel agent_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_monotonic( i32* %in, i32* %out) { entry: @@ -515,11 +790,20 @@ } ; GCN-LABEL: {{^}}agent_acquire: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} +; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX89-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel agent_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void 
@agent_acquire( i32* %in, i32* %out) { entry: @@ -529,11 +813,21 @@ } ; GCN-LABEL: {{^}}agent_seq_cst: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} +; GFX89-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GFX10-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} +; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX89-NEXT: buffer_wbinvl1_vol +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel agent_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_seq_cst( i32* %in, i32* %out) { entry: @@ -543,11 +837,18 @@ } ; GCN-LABEL: {{^}}workgroup_unordered: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel workgroup_unordered +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_unordered( i32* %in, i32* %out) { entry: @@ -556,12 +857,21 @@ ret void } -; GCN-LABEL: {{^}}workgroup_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX89: flat_load_dword 
[[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GCN-LABEL: {{^}}workgroup_monotonic: +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GFX10WGP: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GFX10CU: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel workgroup_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_monotonic( i32* %in, i32* %out) { entry: @@ -570,12 +880,21 @@ ret void } -; GCN-LABEL: {{^}}workgroup_acquire: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GCN-LABEL: {{^}}workgroup_acquire: +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} +; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GFX10WGP: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GFX10CU: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GFX89-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel workgroup_acquire +; GFX10WGP-NOT:
.amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_acquire( i32* %in, i32* %out) { entry: @@ -584,12 +903,25 @@ ret void } -; GCN-LABEL: {{^}}workgroup_seq_cst: -; GFX89-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GCN-LABEL: {{^}}workgroup_seq_cst: +; GFX89-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0 +; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GFX10WGP: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GFX10CU: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; GFX89-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89-NOT: buffer_wbinvl1_vol +; GFX10WGP-NEXT: buffer_gl0_inv +; GFX10CU-NOT: buffer_gl0_inv +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel workgroup_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_seq_cst( i32* %in, i32* %out) { entry: @@ -599,11 +931,18 @@ } ; GCN-LABEL: {{^}}wavefront_unordered: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: 
flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel wavefront_unordered +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_unordered( i32* %in, i32* %out) { entry: @@ -613,11 +952,18 @@ } ; GCN-LABEL: {{^}}wavefront_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel wavefront_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_monotonic( i32* %in, i32* %out) { entry: @@ -627,11 +973,18 @@ } ; GCN-LABEL: {{^}}wavefront_acquire: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel wavefront_acquire +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_acquire( i32* %in, i32* %out) { entry: @@ -641,11 +994,18 @@ } ; GCN-LABEL: {{^}}wavefront_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; 
GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GFX89-NOT: buffer_wbinvl1_vol +; GFX10-NOT: buffer_gl{{[01]}}_inv ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +; GFX10: .amdhsa_kernel wavefront_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_seq_cst( i32* %in, i32* %out) { entry: Index: test/CodeGen/AMDGPU/memory-legalizer-store.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-store.ll +++ test/CodeGen/AMDGPU/memory-legalizer-store.ll @@ -2,12 +2,19 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10CU %s declare i32 @llvm.amdgcn.workitem.id.x() ; GCN-LABEL: {{^}}system_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel 
system_one_as_unordered +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_unordered( i32 %in, i32* %out) { entry: @@ -17,7 +24,12 @@ ; GCN-LABEL: {{^}}system_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel system_one_as_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_monotonic( i32 %in, i32* %out) { entry: @@ -27,7 +39,12 @@ ; GCN-LABEL: {{^}}system_one_as_release: ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel system_one_as_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_release( i32 %in, i32* %out) { entry: @@ -37,7 +54,12 @@ ; GCN-LABEL: {{^}}system_one_as_seq_cst: ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel system_one_as_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_one_as_seq_cst( i32 %in, i32* %out) { entry: @@ -47,7 +69,12 @@ ; GCN-LABEL: {{^}}singlethread_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel singlethread_one_as_unordered +; GFX10WGP-NOT: 
.amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_unordered( i32 %in, i32* %out) { entry: @@ -57,7 +84,12 @@ ; GCN-LABEL: {{^}}singlethread_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel singlethread_one_as_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_monotonic( i32 %in, i32* %out) { entry: @@ -67,7 +99,12 @@ ; GCN-LABEL: {{^}}singlethread_one_as_release: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel singlethread_one_as_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_release( i32 %in, i32* %out) { entry: @@ -77,7 +114,12 @@ ; GCN-LABEL: {{^}}singlethread_one_as_seq_cst: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel singlethread_one_as_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_one_as_seq_cst( i32 %in, i32* %out) { entry: @@ -87,7 +129,12 @@ ; GCN-LABEL: {{^}}agent_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel agent_one_as_unordered +; GFX10WGP-NOT: 
.amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_one_as_unordered( i32 %in, i32* %out) { entry: @@ -97,7 +144,12 @@ ; GCN-LABEL: {{^}}agent_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel agent_one_as_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_one_as_monotonic( i32 %in, i32* %out) { entry: @@ -107,7 +159,12 @@ ; GCN-LABEL: {{^}}agent_one_as_release: ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel agent_one_as_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_one_as_release( i32 %in, i32* %out) { entry: @@ -117,7 +174,12 @@ ; GCN-LABEL: {{^}}agent_one_as_seq_cst: ; GCN: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel agent_one_as_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_one_as_seq_cst( i32 %in, i32* %out) { entry: @@ -127,7 +189,12 @@ ; GCN-LABEL: {{^}}workgroup_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel workgroup_one_as_unordered +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: 
.amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_one_as_unordered( i32 %in, i32* %out) { entry: @@ -137,7 +204,12 @@ ; GCN-LABEL: {{^}}workgroup_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel workgroup_one_as_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_one_as_monotonic( i32 %in, i32* %out) { entry: @@ -145,9 +217,17 @@ ret void } -; GCN-LABEL: {{^}}workgroup_one_as_release: -; GFX89-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GCN-LABEL: {{^}}workgroup_one_as_release: +; GFX89-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel workgroup_one_as_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_one_as_release( i32 %in, i32* %out) { entry: @@ -155,9 +235,17 @@ ret void } -; GCN-LABEL: {{^}}workgroup_one_as_seq_cst: -; GFX89-NOT: s_waitcnt vmcnt(0){{$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GCN-LABEL: {{^}}workgroup_one_as_seq_cst: +; GFX89-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst +; 
GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_one_as_seq_cst( i32 %in, i32* %out) { entry: @@ -167,7 +255,12 @@ ; GCN-LABEL: {{^}}wavefront_one_as_unordered: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel wavefront_one_as_unordered +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_one_as_unordered( i32 %in, i32* %out) { entry: @@ -177,7 +270,12 @@ ; GCN-LABEL: {{^}}wavefront_one_as_monotonic: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel wavefront_one_as_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_one_as_monotonic( i32 %in, i32* %out) { entry: @@ -187,7 +285,12 @@ ; GCN-LABEL: {{^}}wavefront_one_as_release: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel wavefront_one_as_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_one_as_release( i32 %in, i32* %out) { entry: @@ -197,7 +300,12 @@ ; GCN-LABEL: {{^}}wavefront_one_as_seq_cst: ; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel wavefront_one_as_seq_cst +; GFX10WGP-NOT: 
.amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_one_as_seq_cst( i32 %in, i32* %out) { entry: @@ -207,6 +315,11 @@ ; GCN-LABEL: {{^}}nontemporal_private_0: ; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}} +; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}} +; GFX10: .amdhsa_kernel nontemporal_private_0 +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @nontemporal_private_0( i32* %in, i32 addrspace(5)* %out) { entry: @@ -217,6 +330,11 @@ ; GCN-LABEL: {{^}}nontemporal_private_1: ; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}} +; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}} +; GFX10: .amdhsa_kernel nontemporal_private_1 +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @nontemporal_private_1( i32* %in, i32 addrspace(5)* %out) { entry: @@ -230,6 +348,11 @@ ; GCN-LABEL: {{^}}nontemporal_global_0: ; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} ; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}} +; GFX10: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off slc{{$}} +; GFX10: .amdhsa_kernel nontemporal_global_0 +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @nontemporal_global_0( i32* %in, i32 addrspace(1)* %out) { entry: @@ -241,6 +364,11 @@ ; GCN-LABEL: {{^}}nontemporal_global_1: ; GFX8: flat_store_dword 
v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} ; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} +; GFX10: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] slc{{$}} +; GFX10: .amdhsa_kernel nontemporal_global_1 +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @nontemporal_global_1( i32* %in, i32 addrspace(1)* %out) { entry: @@ -253,6 +381,10 @@ ; GCN-LABEL: {{^}}nontemporal_local_0: ; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel nontemporal_local_0 +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @nontemporal_local_0( i32* %in, i32 addrspace(3)* %out) { entry: @@ -263,6 +395,10 @@ ; GCN-LABEL: {{^}}nontemporal_local_1: ; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel nontemporal_local_1 +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @nontemporal_local_1( i32* %in, i32 addrspace(3)* %out) { entry: @@ -275,6 +411,11 @@ ; GCN-LABEL: {{^}}nontemporal_flat_0: ; GFX89: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} +; GFX10: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} slc{{$}} +; GFX10: .amdhsa_kernel nontemporal_flat_0 +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @nontemporal_flat_0( i32* %in, i32* %out) { entry: @@ -285,6 +426,11 @@ ; GCN-LABEL: {{^}}nontemporal_flat_1: ; GFX89: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} +; GFX10: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} slc{{$}} +; GFX10: 
.amdhsa_kernel nontemporal_flat_1 +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @nontemporal_flat_1( i32* %in, i32* %out) { entry: @@ -296,8 +442,13 @@ } ; GCN-LABEL: {{^}}system_unordered: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel system_unordered +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_unordered( i32 %in, i32* %out) { entry: @@ -306,8 +457,13 @@ } ; GCN-LABEL: {{^}}system_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel system_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_monotonic( i32 %in, i32* %out) { entry: @@ -316,8 +472,14 @@ } ; GCN-LABEL: {{^}}system_release: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel system_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_release( i32 %in, i32* %out) { entry: @@ -326,8 +488,14 @@ } ; GCN-LABEL: {{^}}system_seq_cst: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt 
lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel system_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @system_seq_cst( i32 %in, i32* %out) { entry: @@ -336,8 +504,13 @@ } ; GCN-LABEL: {{^}}singlethread_unordered: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel singlethread_unordered +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_unordered( i32 %in, i32* %out) { entry: @@ -346,8 +519,13 @@ } ; GCN-LABEL: {{^}}singlethread_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel singlethread_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_monotonic( i32 %in, i32* %out) { entry: @@ -356,8 +534,13 @@ } ; GCN-LABEL: {{^}}singlethread_release: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel singlethread_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_release( i32 %in, i32* %out) { entry: @@ -366,8 
+549,13 @@ } ; GCN-LABEL: {{^}}singlethread_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel singlethread_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @singlethread_seq_cst( i32 %in, i32* %out) { entry: @@ -376,8 +564,13 @@ } ; GCN-LABEL: {{^}}agent_unordered: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel agent_unordered +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_unordered( i32 %in, i32* %out) { entry: @@ -386,8 +579,13 @@ } ; GCN-LABEL: {{^}}agent_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel agent_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_monotonic( i32 %in, i32* %out) { entry: @@ -396,8 +594,14 @@ } ; GCN-LABEL: {{^}}agent_release: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel agent_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: 
.amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_release( i32 %in, i32* %out) { entry: @@ -406,8 +610,14 @@ } ; GCN-LABEL: {{^}}agent_seq_cst: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10: s_waitcnt lgkmcnt(0){{$}} +; GFX10: s_waitcnt_vscnt null, 0x0{{$}} ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel agent_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @agent_seq_cst( i32 %in, i32* %out) { entry: @@ -416,8 +626,13 @@ } ; GCN-LABEL: {{^}}workgroup_unordered: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel workgroup_unordered +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_unordered( i32 %in, i32* %out) { entry: @@ -426,8 +641,13 @@ } ; GCN-LABEL: {{^}}workgroup_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel workgroup_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_monotonic( i32 %in, i32* %out) { entry: @@ -435,9 +655,17 @@ ret void } -; GCN-LABEL: {{^}}workgroup_release: -; GFX89-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GCN-LABEL: {{^}}workgroup_release: +; GFX89-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) 
lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel workgroup_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_release( i32 %in, i32* %out) { entry: @@ -445,9 +673,17 @@ ret void } -; GCN-LABEL: {{^}}workgroup_seq_cst: -; GFX89-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GCN-LABEL: {{^}}workgroup_seq_cst: +; GFX89-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} +; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel workgroup_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @workgroup_seq_cst( i32 %in, i32* %out) { entry: @@ -456,8 +692,13 @@ } ; GCN-LABEL: {{^}}wavefront_unordered: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel wavefront_unordered +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_unordered( i32 %in, i32* %out) { entry: @@ -466,8 +707,13 @@ } ; GCN-LABEL: {{^}}wavefront_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword 
v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel wavefront_monotonic +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_monotonic( i32 %in, i32* %out) { entry: @@ -476,8 +722,13 @@ } ; GCN-LABEL: {{^}}wavefront_release: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel wavefront_release +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_release( i32 %in, i32* %out) { entry: @@ -486,8 +737,13 @@ } ; GCN-LABEL: {{^}}wavefront_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GCN-NOT: s_waitcnt vmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX10: .amdhsa_kernel wavefront_seq_cst +; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 +; GFX10CU: .amdhsa_workgroup_processor_mode 0 +; GFX10-NOT: .amdhsa_memory_ordered 0 define amdgpu_kernel void @wavefront_seq_cst( i32 %in, i32* %out) { entry: