diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -6036,25 +6036,30 @@ subsequent vector memory or LDS operation and so can be used to meet the requirements of acquire and release. * The L2 cache of one agent can be kept coherent with other agents by using - the MTYPE RW (read-write) for memory local to the L2, and MTYPE NC - (non-coherent) with the PTE C-bit set for memory not local to the L2. + the MTYPE CC (cache-coherent) with the PTE C-bit for memory local to the L2, + and MTYPE UC (uncached) with the PTE C-bit set for memory not local to the + L2. * Any local memory cache lines will be automatically invalidated by writes from CUs associated with other L2 caches, or writes from the CPU, due to - the cache probe caused by the PTE C-bit. - * XGMI accesses from the CPU to local memory may be cached on the CPU. + the cache probe caused by coherent requests. Coherent requests are caused + by GPU accesses to pages with the PTE C-bit set, by CPU accesses over + XGMI, and by PCIe requests that are configured to be coherent requests. + + * XGMI accesses from the CPU to local memory may be cached on the CPU. Subsequent access from the GPU will automatically invalidate or writeback - the CPU cache due to the L2 probe filter. + the CPU cache due to the L2 probe filter and the PTE C-bit being set. * Since all work-groups on the same agent share the same L2, no L2 invalidation or writeback is required for coherence. - * To ensure coherence of local memory writes of work-groups in different - agents a ``buffer_wbl2`` is required. It will writeback dirty L2 cache - lines. - * To ensure coherence of local memory reads of work-groups in different - agents a ``buffer_invl2`` is required. It will invalidate non-local L2 - cache lines. 
- - * PCIe access from the GPU to the CPU memory can be kept coherent by using the + * Since local memory reads and writes of work-groups in different agents + access memory using MTYPE CC, no L2 invalidate or writeback is required + for coherence. MTYPE CC causes write through to DRAM and local reads to be + invalidated by remote writes with the PTE C-bit. + * Since remote memory reads and writes of work-groups in different agents + access memory using MTYPE UC, no L2 invalidate or writeback is required + for coherence. MTYPE UC causes direct accesses to DRAM. + + * PCIe access from the GPU to the CPU memory is kept coherent by using the MTYPE UC (uncached) which bypasses the L2. Scalar memory operations are only used to access memory that is proven to not @@ -6116,7 +6121,7 @@ - volatile 1. buffer/global/flat_load - glc=1 scc=1 + glc=1 2. s_waitcnt vmcnt(0) - Must happen before @@ -6144,7 +6149,6 @@ - volatile 1. buffer/global/flat_store - scc=1 2. s_waitcnt vmcnt(0) - Must happen before @@ -6183,13 +6187,13 @@ load atomic monotonic - agent - global 1. buffer/global/flat_load - generic glc=1 load atomic monotonic - system - global 1. buffer/global/flat_load - - generic glc=1 scc=1 + - generic glc=1 store atomic monotonic - singlethread - global 1. buffer/global/flat_store - wavefront - generic - workgroup - agent store atomic monotonic - system - global 1. buffer/global/flat_store - - generic scc=1 + - generic store atomic monotonic - singlethread - local *If TgSplit execution mode, - wavefront local address space cannot - workgroup be used.* @@ -6200,7 +6204,7 @@ - workgroup - agent atomicrmw monotonic - system - global 1. buffer/global/flat_atomic - - generic scc=1 + - generic atomicrmw monotonic - singlethread - local *If TgSplit execution mode, - wavefront local address space cannot - workgroup be used.* @@ -6319,19 +6323,18 @@ stale global data. load atomic acquire - system - global 1. buffer/global/flat_load - glc=1 scc=1 + glc=1 2. 
s_waitcnt vmcnt(0) - Must happen before - following buffer_invl2 and + following buffer_wbinvl1_vol. - Ensures the load has completed before invalidating the cache. - 3. buffer_invl2; - buffer_wbinvl1_vol + 3. buffer_wbinvl1_vol - Must happen before any following @@ -6341,10 +6344,10 @@ - Ensures that following loads will not see - stale MTYPE NC global data. + stale L1 global data. MTYPE RW and CC memory will - never be stale due to the - memory probes. + never be stale in L2 due to + the memory probes. load atomic acquire - agent - generic 1. flat_load glc=1 2. s_waitcnt vmcnt(0) & @@ -6374,7 +6377,7 @@ will not see stale global data. - load atomic acquire - system - generic 1. flat_load glc=1 scc=1 + load atomic acquire - system - generic 1. flat_load glc=1 2. s_waitcnt vmcnt(0) & lgkmcnt(0) @@ -6384,15 +6387,13 @@ lgkmcnt(0). - Must happen before following - buffer_invl2 and buffer_wbinvl1_vol. - Ensures the flat_load has completed before invalidating the caches. - 3. buffer_invl2; - buffer_wbinvl1_vol + 3. buffer_wbinvl1_vol - Must happen before any following @@ -6401,11 +6402,11 @@ atomic/atomicrmw. - Ensures that following - loads will not see - stale MTYPE NC global data. + L1 loads will not see + stale global data. MTYPE RW and CC memory will - never be stale due to the - memory probes. + never be stale in L2 due to + the memory probes. atomicrmw acquire - singlethread - global 1. buffer/global/flat_atomic - wavefront - generic @@ -6518,11 +6519,10 @@ global data. atomicrmw acquire - system - global 1. buffer/global_atomic - scc=1 2. s_waitcnt vmcnt(0) - Must happen before - following buffer_invl2 and + following buffer_wbinvl1_vol. - Ensures the atomicrmw has @@ -6530,8 +6530,7 @@ invalidating the caches. - 3. buffer_invl2; - buffer_wbinvl1_vol + 3. buffer_wbinvl1_vol - Must happen before any following @@ -6541,10 +6540,10 @@ - Ensures that following loads will not see - stale MTYPE NC global data. 
- MTYPE RW and CC memory will - never be stale due to the - memory probes. + stale L1 global data. + MTYPE RW and CC memory will + never be stale in L2 due to + the memory probes. atomicrmw acquire - agent - generic 1. flat_atomic 2. s_waitcnt vmcnt(0) & @@ -6575,7 +6574,7 @@ will not see stale global data. - atomicrmw acquire - system - generic 1. flat_atomic scc=1 + atomicrmw acquire - system - generic 1. flat_atomic 2. s_waitcnt vmcnt(0) & lgkmcnt(0) @@ -6585,7 +6584,6 @@ lgkmcnt(0). - Must happen before following - buffer_invl2 and buffer_wbinvl1_vol. - Ensures the atomicrmw has @@ -6593,8 +6591,7 @@ invalidating the caches. - 3. buffer_invl2; - buffer_wbinvl1_vol + 3. buffer_wbinvl1_vol - Must happen before any following @@ -6604,10 +6601,10 @@ - Ensures that following loads will not see - stale MTYPE NC global data. + stale L1 global data. MTYPE RW and CC memory will - never be stale due to the - memory probes. + never be stale in L2 due to + the memory probes. fence acquire - singlethread *none* *none* - wavefront @@ -6818,7 +6815,7 @@ termed the fence-paired-atomic). - Must happen before - the following buffer_invl2 and + the following buffer_wbinvl1_vol. - Ensures that the fence-paired atomic @@ -6833,8 +6830,7 @@ the fence-paired-atomic. - 2. buffer_invl2; - buffer_wbinvl1_vol + 2. buffer_wbinvl1_vol - Must happen before any following global/generic @@ -6842,10 +6838,12 @@ atomic/store/store atomic/atomicrmw. - Ensures that - following loads - will not see stale - global data. - + following + loads will not see + stale L1 global data. + MTYPE RW and CC memory will + never be stale in L2 due to + the memory probes. **Release Atomic** ------------------------------------------------------------------------------------ store atomic release - singlethread - global 1. buffer/global/flat_store @@ -6936,18 +6934,8 @@ released. 2. buffer/global/flat_store - store atomic release - system - global 1. 
buffer_wbl2 - - generic - - Must happen before - following s_waitcnt. - - Performs L2 writeback to - ensure previous - global/generic - store/atomicrmw are - visible at system scope. - - 2. s_waitcnt lgkmcnt(0) & - vmcnt(0) + store atomic release - system - global 1. s_waitcnt lgkmcnt(0) & + - generic vmcnt(0) - If TgSplit execution mode, omit lgkmcnt(0). @@ -6991,7 +6979,6 @@ released. 2. buffer/global/flat_store - scc=1 atomicrmw release - singlethread - global 1. buffer/global/flat_atomic - wavefront - generic atomicrmw release - singlethread - local *If TgSplit execution mode, @@ -7079,18 +7066,8 @@ is being released. 2. buffer/global/flat_atomic - atomicrmw release - system - global 1. buffer_wbl2 - - generic - - Must happen before - following s_waitcnt. - - Performs L2 writeback to - ensure previous - global/generic - store/atomicrmw are - visible at system scope. - - 2. s_waitcnt lgkmcnt(0) & - vmcnt(0) + atomicrmw release - system - global 1. s_waitcnt lgkmcnt(0) & + - generic vmcnt(0) - If TgSplit execution mode, omit lgkmcnt(0). @@ -7131,8 +7108,7 @@ store that is being released. - 3. buffer/global/flat_atomic - scc=1 + 2. buffer/global/flat_atomic fence release - singlethread *none* *none* - wavefront fence release - workgroup *none* 1. s_waitcnt lgkm/vmcnt(0) @@ -7265,20 +7241,7 @@ following fence-paired-atomic. - fence release - system *none* 1. buffer_wbl2 - - - If OpenCL and - address space is - local, omit. - - Must happen before - following s_waitcnt. - - Performs L2 writeback to - ensure previous - global/generic - store/atomicrmw are - visible at system scope. - - 2. s_waitcnt lgkmcnt(0) & + fence release - system *none* 1. s_waitcnt lgkmcnt(0) & vmcnt(0) - If TgSplit execution mode, @@ -7568,17 +7531,7 @@ will not see stale global data. - atomicrmw acq_rel - system - global 1. buffer_wbl2 - - - Must happen before - following s_waitcnt. 
- - Performs L2 writeback to - ensure previous - global/generic - store/atomicrmw are - visible at system scope. - - 2. s_waitcnt lgkmcnt(0) & + atomicrmw acq_rel - system - global 1. s_waitcnt lgkmcnt(0) & vmcnt(0) - If TgSplit execution mode, @@ -7619,12 +7572,11 @@ atomicrmw that is being released. - 3. buffer/global_atomic - scc=1 - 4. s_waitcnt vmcnt(0) + 2. buffer/global_atomic + 3. s_waitcnt vmcnt(0) - Must happen before - following buffer_invl2 and + following buffer_wbinvl1_vol. - Ensures the atomicrmw has @@ -7632,8 +7584,7 @@ invalidating the caches. - 5. buffer_invl2; - buffer_wbinvl1_vol + 4. buffer_wbinvl1_vol - Must happen before any following @@ -7641,12 +7592,12 @@ load/load atomic/atomicrmw. - Ensures that - following loads - will not see stale - MTYPE NC global data. + following + loads will not see + stale L1 global data. MTYPE RW and CC memory will - never be stale due to the - memory probes. + never be stale in L2 due to + the memory probes. atomicrmw acq_rel - agent - generic 1. s_waitcnt lgkmcnt(0) & vmcnt(0) @@ -7718,17 +7669,7 @@ will not see stale global data. - atomicrmw acq_rel - system - generic 1. buffer_wbl2 - - - Must happen before - following s_waitcnt. - - Performs L2 writeback to - ensure previous - global/generic - store/atomicrmw are - visible at system scope. - - 2. s_waitcnt lgkmcnt(0) & + atomicrmw acq_rel - system - generic 1. s_waitcnt lgkmcnt(0) & vmcnt(0) - If TgSplit execution mode, @@ -7769,8 +7710,8 @@ atomicrmw that is being released. - 3. flat_atomic scc=1 - 4. s_waitcnt vmcnt(0) & + 2. flat_atomic + 3. s_waitcnt vmcnt(0) & lgkmcnt(0) - If TgSplit execution mode, @@ -7778,7 +7719,7 @@ - If OpenCL, omit lgkmcnt(0). - Must happen before - following buffer_invl2 and + following buffer_wbinvl1_vol. - Ensures the atomicrmw has @@ -7786,8 +7727,7 @@ invalidating the caches. - 5. buffer_invl2; - buffer_wbinvl1_vol + 4. buffer_wbinvl1_vol - Must happen before any following @@ -7795,12 +7735,12 @@ load/load atomic/atomicrmw. 
- Ensures that - following loads - will not see stale - MTYPE NC global data. + following + loads will not see + stale L1 global data. MTYPE RW and CC memory will - never be stale due to the - memory probes. + never be stale in L2 due to + the memory probes. fence acq_rel - singlethread *none* *none* - wavefront @@ -8010,20 +7950,7 @@ requirements of acquire. - fence acq_rel - system *none* 1. buffer_wbl2 - - - If OpenCL and - address space is - local, omit. - - Must happen before - following s_waitcnt. - - Performs L2 writeback to - ensure previous - global/generic - store/atomicrmw are - visible at system scope. - - 2. s_waitcnt lgkmcnt(0) & + fence acq_rel - system *none* 1. s_waitcnt lgkmcnt(0) & vmcnt(0) - If TgSplit execution mode, @@ -8064,7 +7991,7 @@ atomic/store atomic/atomicrmw. - Must happen before - the following buffer_invl2 and + the following buffer_wbinvl1_vol. - Ensures that the preceding @@ -8103,8 +8030,7 @@ requirements of release. - 3. buffer_invl2; - buffer_wbinvl1_vol + 2. buffer_wbinvl1_vol - Must happen before any following @@ -8113,12 +8039,12 @@ atomic/store/store atomic/atomicrmw. - Ensures that - following loads - will not see stale - MTYPE NC global data. + following + loads will not see + stale L1 global data. MTYPE RW and CC memory will - never be stale due to the - memory probes. + never be stale in L2 due to + the memory probes. **Sequential Consistent Atomic** ------------------------------------------------------------------------------------ diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -420,14 +420,6 @@ }; class SIGfx90ACacheControl : public SIGfx7CacheControl { -protected: - - /// Sets SCC bit to "true" if present in \p MI. Returns true if \p MI - /// is modified, false otherwise. 
- bool enableSCCBit(const MachineBasicBlock::iterator &MI) const { - return enableNamedBit(MI, AMDGPU::CPol::SCC);; - } - public: SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}; @@ -460,12 +452,6 @@ SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const override; - - bool insertRelease(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - bool IsCrossAddrSpaceOrdering, - Position Pos) const override; }; class SIGfx10CacheControl : public SIGfx7CacheControl { @@ -1108,9 +1094,6 @@ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { switch (Scope) { case SIAtomicScope::SYSTEM: - Changed |= enableSCCBit(MI); - Changed |= enableGLCBit(MI); - break; case SIAtomicScope::AGENT: Changed |= enableGLCBit(MI); break; @@ -1150,8 +1133,6 @@ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { switch (Scope) { case SIAtomicScope::SYSTEM: - Changed |= enableSCCBit(MI); - LLVM_FALLTHROUGH; case SIAtomicScope::AGENT: /// Do not set glc for store atomic operations as they implicitly write /// through the L1 cache. @@ -1187,8 +1168,6 @@ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { switch (Scope) { case SIAtomicScope::SYSTEM: - Changed |= enableSCCBit(MI); - LLVM_FALLTHROUGH; case SIAtomicScope::AGENT: /// Do not set glc for RMW atomic operations as they implicitly bypass /// the L1 cache, and the glc bit is instead used to indicate if they are @@ -1227,7 +1206,6 @@ if (Op == SIMemOp::LOAD) { Changed |= enableGLCBit(MI); } - Changed |= enableSCCBit(MI); // Ensure operation has completed at system scope to cause all volatile // operations to be visible outside the program in a global order. 
Do not @@ -1287,26 +1265,9 @@ bool Changed = false; - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - - if (Pos == Position::AFTER) - ++MI; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { switch (Scope) { case SIAtomicScope::SYSTEM: - // Ensures that following loads will not see stale remote VMEM data or - // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and - // CC will never be stale due to the local memory probes. - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2)); - // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the - // hardware does not reorder memory operations by the same wave with - // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to - // remove any cache lines of earlier writes by the same wave and ensures - // later reads by the same wave will refetch the cache lines. - Changed = true; - break; case SIAtomicScope::AGENT: // Same as GFX7. break; @@ -1336,62 +1297,11 @@ /// Other address spaces do not have a cache. - if (Pos == Position::AFTER) - --MI; - Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos); return Changed; } -bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, - SIAtomicScope Scope, - SIAtomicAddrSpace AddrSpace, - bool IsCrossAddrSpaceOrdering, - Position Pos) const { - bool Changed = false; - - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - - if (Pos == Position::AFTER) - ++MI; - - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { - switch (Scope) { - case SIAtomicScope::SYSTEM: - // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the - // hardware does not reorder memory operations by the same wave with - // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed - // to initiate writeback of any dirty cache lines of earlier writes by the - // same wave. 
A "S_WAITCNT vmcnt(0)" is needed after to ensure the - // writeback has completed. - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)); - // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT - // vmcnt(0)" needed by the "BUFFER_WBL2". - Changed = true; - break; - case SIAtomicScope::AGENT: - case SIAtomicScope::WORKGROUP: - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // Same as GFX7. - break; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - } - - if (Pos == Position::AFTER) - --MI; - - Changed |= - SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, - IsCrossAddrSpaceOrdering, Pos); - - return Changed; -} - bool SIGfx10CacheControl::enableLoadCacheBypass( const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, @@ -1767,7 +1677,7 @@ Position::BEFORE); // TODO: If both release and invalidate are happening they could be combined - // to use the single "BUFFER_WBL2" instruction. This could be done by + // to use the single "BUFFER_WBINV*" instruction. This could be done by // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to // track cache invalidate and write back instructions. 
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll @@ -431,7 +431,7 @@ ; GFX7: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX90A: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32, ; GFX7: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}} -; GFX90A: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc scc{{$}} +; GFX90A: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc{{$}} ; GCN: s_setpc_b64 define void @too_many_args_use_workitem_id_x_byval( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, @@ -518,7 +518,7 @@ ; GCN: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} ; GFX7: buffer_store_dword [[K]], off, s[0:3], s33{{$}} -; GFX90A: buffer_store_dword [[K]], off, s[0:3], s33 scc{{$}} +; GFX90A: buffer_store_dword [[K]], off, s[0:3], s33{{$}} ; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}} ; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4 ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], @@ -551,9 +551,9 @@ ; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v12 ; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v30{{$}} ; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v0{{$}} -; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v29, off scc{{$}} -; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v30, off scc{{$}} -; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v32, off scc{{$}} +; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v29, off{{$}} +; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v30, off{{$}} +; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v32, off{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll 
b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -12,7 +12,7 @@ ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_add v2, v0, v1, s[0:1] glc scc +; CHECK-NEXT: global_atomic_add v2, v0, v1, s[0:1] glc ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -34,7 +34,7 @@ ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_sub v2, v0, v1, s[0:1] glc scc +; CHECK-NEXT: global_atomic_sub v2, v0, v1, s[0:1] glc ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -56,7 +56,7 @@ ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_and v2, v0, v1, s[0:1] glc scc +; CHECK-NEXT: global_atomic_and v2, v0, v1, s[0:1] glc ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -78,7 +78,7 @@ ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_or v2, v0, v1, s[0:1] glc scc +; CHECK-NEXT: global_atomic_or v2, v0, v1, s[0:1] glc ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -100,7 +100,7 @@ ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_xor v2, v0, v1, s[0:1] glc scc +; CHECK-NEXT: global_atomic_xor v2, v0, v1, s[0:1] glc ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -130,7 +130,7 @@ ; CHECK-NEXT: v_not_b32_e32 v0, v1 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_or_b32_e32 v0, -2, v0 -; 
CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc +; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -156,7 +156,7 @@ ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[0:1] glc scc +; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[0:1] glc ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -178,7 +178,7 @@ ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[0:1] glc scc +; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[0:1] glc ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -200,7 +200,7 @@ ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[0:1] glc scc +; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[0:1] glc ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -222,7 +222,7 @@ ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[0:1] glc scc +; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[0:1] glc ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -245,7 +245,7 @@ ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_cmpswap v2, v2, v[0:1], s[0:1] glc scc +; CHECK-NEXT: global_atomic_cmpswap v2, v2, v[0:1], s[0:1] glc ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -268,7 +268,7 
@@ ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_swap v2, v0, v1, s[0:1] glc scc +; CHECK-NEXT: global_atomic_swap v2, v0, v1, s[0:1] glc ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_waitcnt vmcnt(0) @@ -339,7 +339,7 @@ ; CHECK-NEXT: v_mov_b32_e32 v1, v0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_add_f32_e32 v0, 1.0, v1 -; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc +; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -374,7 +374,7 @@ ; CHECK-NEXT: v_mov_b32_e32 v1, v0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_add_f32_e32 v0, -1.0, v1 -; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc +; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/expand-atomicrmw-syncscope.ll b/llvm/test/CodeGen/AMDGPU/expand-atomicrmw-syncscope.ll --- a/llvm/test/CodeGen/AMDGPU/expand-atomicrmw-syncscope.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-atomicrmw-syncscope.ll @@ -1,8 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; Check that syncscope it copied from atomicrmw to cmpxchg during expansion. -; There should be no scc unless we have system scope. 
- ; GCN-LABEL: {{^}}expand_atomicrmw_agent: ; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc{{$}} define void @expand_atomicrmw_agent(float addrspace(1)* nocapture %arg) { @@ -60,7 +57,7 @@ } ; GCN-LABEL: {{^}}expand_atomicrmw_one_as: -; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc scc{{$}} +; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc{{$}} define void @expand_atomicrmw_one_as(float addrspace(1)* nocapture %arg) { entry: %ret = atomicrmw fadd float addrspace(1)* %arg, float 1.000000e+00 syncscope("one-as") monotonic, align 4 @@ -68,7 +65,7 @@ } ; GCN-LABEL: {{^}}expand_atomicrmw_system: -; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc scc{{$}} +; GCN: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], off glc{{$}} define void @expand_atomicrmw_system(float addrspace(1)* nocapture %arg) { entry: %ret = atomicrmw fadd float addrspace(1)* %arg, float 1.000000e+00 monotonic, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -424,11 +424,8 @@ ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 -; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc scc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -473,11 +470,8 @@ ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 -; GFX90A-NEXT: 
buffer_wbl2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -544,11 +538,8 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 -; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc scc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] @@ -592,11 +583,8 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] @@ -650,13 +638,11 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc scc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_invl2 -; GFX90A-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] @@ -700,11 +686,8 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -731,13 +714,11 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 -; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc scc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: buffer_invl2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] @@ -780,11 +761,8 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc scc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; 
GFX90A-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -70,11 +70,8 @@ ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 @@ -570,11 +567,8 @@ ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -35,8 +35,6 @@ ; GFX90A-TGSPLIT-LABEL: singlethread_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("singlethread") acquire ret void @@ -70,8 +68,6 @@ ; GFX90A-TGSPLIT-LABEL: singlethread_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("singlethread") release ret void @@ -105,8 +101,6 @@ ; GFX90A-TGSPLIT-LABEL: singlethread_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; 
%entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("singlethread") acq_rel ret void @@ -140,8 +134,6 @@ ; GFX90A-TGSPLIT-LABEL: singlethread_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("singlethread") seq_cst ret void @@ -175,8 +167,6 @@ ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("singlethread-one-as") acquire ret void @@ -210,8 +200,6 @@ ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("singlethread-one-as") release ret void @@ -245,8 +233,6 @@ ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("singlethread-one-as") acq_rel ret void @@ -280,8 +266,6 @@ ; GFX90A-TGSPLIT-LABEL: singlethread_one_as_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("singlethread-one-as") seq_cst ret void @@ -315,8 +299,6 @@ ; GFX90A-TGSPLIT-LABEL: wavefront_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("wavefront") acquire ret void @@ -350,8 +332,6 @@ ; GFX90A-TGSPLIT-LABEL: wavefront_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("wavefront") release ret void @@ -385,8 +365,6 @@ ; GFX90A-TGSPLIT-LABEL: wavefront_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("wavefront") acq_rel ret void @@ -420,8 +398,6 @@ ; GFX90A-TGSPLIT-LABEL: wavefront_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("wavefront") seq_cst ret void @@ -455,8 +431,6 @@ ; GFX90A-TGSPLIT-LABEL: 
wavefront_one_as_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("wavefront-one-as") acquire ret void @@ -490,8 +464,6 @@ ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("wavefront-one-as") release ret void @@ -525,8 +497,6 @@ ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("wavefront-one-as") acq_rel ret void @@ -560,8 +530,6 @@ ; GFX90A-TGSPLIT-LABEL: wavefront_one_as_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("wavefront-one-as") seq_cst ret void @@ -605,8 +573,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("workgroup") acquire ret void @@ -648,8 +614,6 @@ ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("workgroup") release ret void @@ -693,8 +657,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("workgroup") acq_rel ret void @@ -738,8 +700,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("workgroup") seq_cst ret void @@ -778,8 +738,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("workgroup-one-as") acquire ret void @@ -816,8 +774,6 @@ ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("workgroup-one-as") 
release ret void @@ -856,8 +812,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("workgroup-one-as") acq_rel ret void @@ -896,8 +850,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("workgroup-one-as") seq_cst ret void @@ -948,8 +900,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("agent") acquire ret void @@ -992,8 +942,6 @@ ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("agent") release ret void @@ -1044,8 +992,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("agent") acq_rel ret void @@ -1096,8 +1042,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("agent") seq_cst ret void @@ -1148,8 +1092,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("agent-one-as") acquire ret void @@ -1192,8 +1134,6 @@ ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("agent-one-as") release ret void @@ -1244,8 +1184,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("agent-one-as") acq_rel ret void @@ -1296,8 +1234,6 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence 
syncscope("agent-one-as") seq_cst ret void @@ -1339,23 +1275,15 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: system_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence acquire ret void @@ -1391,17 +1319,13 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: system_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence release ret void @@ -1443,23 +1367,15 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: system_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence acq_rel ret 
void @@ -1501,23 +1417,15 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: system_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence seq_cst ret void @@ -1559,23 +1467,15 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("one-as") acquire ret void @@ -1611,17 +1511,13 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("one-as") release ret void @@ 
-1663,23 +1559,15 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("one-as") acq_rel ret void @@ -1721,23 +1609,15 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm -; -; entry: fence syncscope("one-as") seq_cst ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -150,7 +150,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc +; 
GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -163,7 +163,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -243,13 +243,12 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -259,9 +258,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -349,13 +346,12 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, 
v[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -366,9 +362,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -508,7 +502,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store: @@ -518,7 +512,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -586,9 +580,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_store: @@ -598,9 +591,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -668,9 +660,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store: @@ -680,9 +671,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -744,7 +734,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw: @@ -754,7 +744,7 @@ ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -827,10 +817,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -841,9 +829,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -913,9 +899,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw: @@ -925,9 +910,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1006,12 +990,9 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1022,11 +1003,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1107,12 +1085,9 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1123,11 +1098,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1207,11 +1179,10 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1222,9 +1193,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -1312,13 +1281,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], 
s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1329,11 +1296,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -1421,13 +1385,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1438,11 +1400,8 @@ ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -1520,7 +1479,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: @@ -1530,7 +1489,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -1616,10 +1575,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1630,9 +1587,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1715,9 +1670,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: @@ -1727,9 +1681,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -1821,12 +1774,9 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1837,11 +1787,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1935,12 +1882,9 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1951,11 +1895,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2043,10 +1984,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2057,9 +1996,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2153,12 +2090,9 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2169,11 +2103,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2267,12 +2198,9 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2283,11 +2211,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2381,12 +2306,9 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2397,11 +2319,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2495,12 +2414,9 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2511,11 +2427,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2616,11 +2529,10 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2631,9 +2543,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2743,13 +2653,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2760,11 +2668,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2874,13 +2779,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; 
GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2891,11 +2794,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -2999,11 +2899,10 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3014,9 +2913,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; 
GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -3126,13 +3023,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3143,11 +3038,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -3257,13 +3149,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3274,11 +3164,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -3388,13 +3275,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol 
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3405,11 +3290,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -3519,13 +3401,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3536,11 +3416,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc 
-; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -3698,7 +3575,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3711,7 +3588,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -3792,9 +3669,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -3809,9 +3684,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -3900,9 +3773,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -3918,9 +3789,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 @@ -4060,7 +3929,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store: @@ -4070,7 +3939,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -4138,9 +4007,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], 
s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store: @@ -4150,9 +4018,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -4220,9 +4087,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store: @@ -4232,9 +4098,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -4296,7 +4161,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: 
s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: @@ -4306,7 +4171,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -4377,9 +4242,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -4391,9 +4254,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -4463,9 +4324,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: @@ -4475,9 +4335,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -4554,11 +4413,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -4570,11 +4426,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -4653,11 +4506,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], 
v2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -4669,11 +4519,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -4753,9 +4600,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4769,9 +4614,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4859,11 +4702,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4877,11 +4717,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4969,11 +4806,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -4987,11 +4821,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; 
GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5069,7 +4900,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: @@ -5079,7 +4910,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5163,9 +4994,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -5177,9 +5006,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc -; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -5262,9 +5089,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: @@ -5274,9 +5100,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5366,11 +5191,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -5382,11 
+5204,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -5478,11 +5297,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -5494,11 +5310,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -5584,9 +5397,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -5598,9 +5409,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -5692,11 +5501,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -5708,11 +5514,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; 
GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -5804,11 +5607,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -5820,11 +5620,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -5916,11 +5713,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 
scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -5932,11 +5726,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6028,11 +5819,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6044,11 +5832,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; 
GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6150,9 +5935,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6166,9 +5949,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6279,11 +6060,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6297,11 +6075,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6412,11 +6187,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6430,11 +6202,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6539,9 +6308,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6555,9 +6322,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6668,11 +6433,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) @@ -6686,11 +6448,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6801,11 +6560,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6819,11 +6575,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; 
GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6934,11 +6687,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -6952,11 +6702,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -7067,11 +6814,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -7085,11 +6829,8 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -174,7 +174,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -184,7 +184,7 @@ ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -274,9 +274,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] @@ -287,9 +285,7 @@ ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] @@ -385,9 +381,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] @@ -398,9 +392,7 @@ ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; 
GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] @@ -556,7 +548,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_store: @@ -566,7 +558,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -644,9 +636,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_release_store: @@ -656,9 +647,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -736,9 +726,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] 
scc +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_store: @@ -748,9 +737,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -821,7 +809,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_atomicrmw: @@ -831,7 +819,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -913,9 +901,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -927,9 +913,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: 
global_atomic_swap v0, v1, s[0:1] scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1009,9 +993,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_release_atomicrmw: @@ -1021,9 +1004,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1112,11 +1094,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1128,11 +1107,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1223,11 +1199,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1239,11 +1212,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1332,9 +1302,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: 
buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -1347,9 +1315,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -1447,11 +1413,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -1464,11 +1427,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -1566,11 +1526,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; 
GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -1583,11 +1540,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -1669,7 +1623,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg: @@ -1679,7 +1633,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -1769,9 +1723,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1783,9 +1735,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1873,9 +1823,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_release_monotonic_cmpxchg: @@ -1885,9 +1834,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], 
s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -1984,11 +1932,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2000,11 +1945,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2103,11 +2045,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: 
buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2119,11 +2058,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2215,9 +2151,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2229,9 +2163,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2330,11 +2262,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2346,11 +2275,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2449,11 +2375,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2465,11 +2388,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2568,11 +2488,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2584,11 +2501,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2687,11 +2601,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2703,11 +2614,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2806,9 +2714,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -2821,9 +2727,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -2932,11 +2836,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -2949,11 +2850,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -3062,11 +2960,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -3079,11 +2974,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -3185,9 +3077,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -3200,9 +3090,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -3311,11 +3199,8 @@ ; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -3328,11 +3213,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -3441,11 +3323,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: 
global_store_dword v2, v0, s[0:1] @@ -3458,11 +3337,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -3571,11 +3447,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -3588,11 +3461,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol 
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -3701,11 +3571,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -3718,11 +3585,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -3903,7 +3767,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -3913,7 +3777,7 @@ ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -4003,9 +3867,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] @@ -4016,9 +3878,7 @@ ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] @@ -4114,9 +3974,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] @@ -4127,9 +3985,7 @@ ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] @@ -4285,7 +4141,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_store: @@ -4295,7 +4151,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -4373,9 +4229,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_store: @@ -4385,9 +4240,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; 
GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -4465,9 +4319,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_store: @@ -4477,9 +4330,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -4550,7 +4402,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: @@ -4560,7 +4412,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -4642,9 +4494,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; 
GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -4656,9 +4506,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -4738,9 +4586,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_atomicrmw: @@ -4750,9 +4597,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -4841,11 +4687,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; 
GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -4857,11 +4700,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -4952,11 +4792,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -4968,11 +4805,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -5061,9 +4895,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -5076,9 +4908,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -5176,11 +5006,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -5193,11 +5020,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, 
s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -5295,11 +5119,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -5312,11 +5133,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -5398,7 +5216,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: @@ -5408,7 +5226,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] 
op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5498,9 +5316,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -5512,9 +5328,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -5602,9 +5416,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: @@ -5614,9 +5427,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5713,11 +5525,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -5729,11 +5538,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -5832,11 +5638,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc 
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -5848,11 +5651,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -5944,9 +5744,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -5958,9 +5756,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ 
-6059,11 +5855,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6075,11 +5868,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6178,11 +5968,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6194,11 +5981,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6297,11 +6081,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6313,11 +6094,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6416,11 +6194,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6432,11 +6207,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6535,9 +6307,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -6550,9 +6320,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -6661,11 +6429,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -6678,11 +6443,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -6791,11 +6553,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; 
GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -6808,11 +6567,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -6914,9 +6670,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -6929,9 +6683,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: 
buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -7040,11 +6792,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -7057,11 +6806,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -7170,11 +6916,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -7187,11 +6930,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -7300,11 +7040,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -7317,11 +7054,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -7430,11 +7164,8 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] @@ -7447,11 +7178,8 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: buffer_wbl2 -; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1]