diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -6093,10 +6093,10 @@ ensures a previous vector memory operation has completed before executing a subsequent vector memory or LDS operation and so can be used to meet the requirements of acquire and release. - * The L2 cache of one agent can be kept coherent with other agents by using - the MTYPE CC (cache-coherent) with the PTE C-bit for memory local to the L2, - and MTYPE UC (uncached) with the PTE C-bit set for memory not local to the - L2. + * The L2 cache of one agent can be kept coherent with other agents by: + using the MTYPE RW (read-write) or MTYPE CC (cache-coherent) with the PTE + C-bit for memory local to the L2; and using the MTYPE NC (non-coherent) with + the PTE C-bit set or MTYPE UC (uncached) for memory not local to the L2. * Any local memory cache lines will be automatically invalidated by writes from CUs associated with other L2 caches, or writes from the CPU, due to @@ -6108,13 +6108,21 @@ the CPU cache due to the L2 probe filter and and the PTE C-bit being set. * Since all work-groups on the same agent share the same L2, no L2 invalidation or writeback is required for coherence. - * Since local memory reads and writes of work-groups in different agents - access memory using MTYPE CC, no L2 invalidate or writeback is required - for coherence. MTYPE CC causes write through to DRAM and local reads to be - invalidated by remote writes with with the PTE C-bit. - * Since remote memory reads and writes of work-groups in different agents - access memory using MTYPE UC, no L2 invalidate or writeback is required - for coherence. MTYPE UC causes direct accesses to DRAM. + * To ensure coherence of local and remote memory writes of work-groups in + different agents a ``buffer_wbl2`` is required. 
It will writeback dirty L2 + cache lines of MTYPE RW (used for local coarse grain memory) and MTYPE NC + (used for remote coarse grain memory). Note that MTYPE CC (used for local + fine grain memory) causes write through to DRAM, and MTYPE UC (used for + remote fine grain memory) bypasses the L2, so both will never result in + dirty L2 cache lines. + * To ensure coherence of local and remote memory reads of work-groups in + different agents a ``buffer_invl2`` is required. It will invalidate L2 + cache lines with MTYPE NC (used for remote coarse grain memory). Note that + MTYPE CC (used for local fine grain memory) and MTYPE RW (used for local + coarse grain memory) cause local reads to be invalidated by remote writes + with the PTE C-bit so these cache lines are not invalidated. Note that + MTYPE UC (used for remote fine grain memory) bypasses the L2, so will + never result in L2 cache lines that need to be invalidated. * PCIe access from the GPU to the CPU memory is kept coherent by using the MTYPE UC (uncached) which bypasses the L2. @@ -6384,14 +6392,15 @@ 2. s_waitcnt vmcnt(0) - Must happen before - following + following buffer_invl2 and buffer_wbinvl1_vol. - Ensures the load has completed before invalidating the cache. - 3. buffer_wbinvl1_vol + 3. buffer_invl2; + buffer_wbinvl1_vol - Must happen before any following @@ -6401,7 +6410,9 @@ - Ensures that following loads will not see - stale L1 global data. + stale L1 global data, + nor see stale L2 MTYPE + NC global data. MTYPE RW and CC memory will never be stale in L2 due to the memory probes. @@ -6444,13 +6455,15 @@ lgkmcnt(0). - Must happen before following + buffer_invl2 and buffer_wbinvl1_vol. - Ensures the flat_load has completed before invalidating the caches. - 3. buffer_wbinvl1_vol + 3. buffer_invl2; + buffer_wbinvl1_vol - Must happen before any following @@ -6459,8 +6472,10 @@ atomic/atomicrmw. - Ensures that following - L1 loads will not see - stale global data. 
+ loads will not see + stale L1 global data, + nor see stale L2 MTYPE + NC global data. MTYPE RW and CC memory will never be stale in L2 due to the memory probes. @@ -6579,7 +6594,7 @@ 2. s_waitcnt vmcnt(0) - Must happen before - following + following buffer_invl2 and buffer_wbinvl1_vol. - Ensures the atomicrmw has @@ -6587,7 +6602,8 @@ invalidating the caches. - 3. buffer_wbinvl1_vol + 3. buffer_invl2; + buffer_wbinvl1_vol - Must happen before any following @@ -6597,8 +6613,10 @@ - Ensures that following loads will not see - stale L1 global data. - MTYPE RW and CC L2 memory + stale L1 global data, + nor see stale L2 MTYPE + NC global data. + MTYPE RW and CC memory will never be stale in L2 due to the memory probes. @@ -6641,6 +6659,7 @@ lgkmcnt(0). - Must happen before following + buffer_invl2 and buffer_wbinvl1_vol. - Ensures the atomicrmw has @@ -6648,7 +6667,8 @@ invalidating the caches. - 3. buffer_wbinvl1_vol + 3. buffer_invl2; + buffer_wbinvl1_vol - Must happen before any following @@ -6658,7 +6678,9 @@ - Ensures that following loads will not see - stale L1 global data. + stale L1 global data, + nor see stale L2 MTYPE + NC global data. MTYPE RW and CC memory will never be stale in L2 due to the memory probes. @@ -6734,7 +6756,7 @@ value read by the fence-paired-atomic. - 3. buffer_wbinvl1_vol + 2. buffer_wbinvl1_vol - If not TgSplit execution mode, omit. @@ -6872,7 +6894,7 @@ termed the fence-paired-atomic). - Must happen before - the following + the following buffer_invl2 and buffer_wbinvl1_vol. - Ensures that the fence-paired atomic @@ -6887,7 +6909,8 @@ the fence-paired-atomic. - 2. buffer_wbinvl1_vol + 2. buffer_invl2; + buffer_wbinvl1_vol - Must happen before any following global/generic @@ -6897,7 +6920,9 @@ - Ensures that following loads will not see - stale L1 global data. + stale L1 global data, + nor see stale L2 MTYPE + NC global data. MTYPE RW and CC memory will never be stale in L2 due to the memory probes. @@ -6991,8 +7016,18 @@ released. 2. 
buffer/global/flat_store - store atomic release - system - global 1. s_waitcnt lgkmcnt(0) & - - generic vmcnt(0) + store atomic release - system - global 1. buffer_wbl2 + - generic + - Must happen before + following s_waitcnt. + - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at system scope. + + 2. s_waitcnt lgkmcnt(0) & + vmcnt(0) - If TgSplit execution mode, omit lgkmcnt(0). @@ -7035,7 +7070,7 @@ store that is being released. - 2. buffer/global/flat_store + 3. buffer/global/flat_store atomicrmw release - singlethread - global 1. buffer/global/flat_atomic - wavefront - generic atomicrmw release - singlethread - local *If TgSplit execution mode, @@ -7123,8 +7158,18 @@ is being released. 2. buffer/global/flat_atomic - atomicrmw release - system - global 1. s_waitcnt lgkmcnt(0) & - - generic vmcnt(0) + atomicrmw release - system - global 1. buffer_wbl2 + - generic + - Must happen before + following s_waitcnt. + - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at system scope. + + 2. s_waitcnt lgkmcnt(0) & + vmcnt(0) - If TgSplit execution mode, omit lgkmcnt(0). @@ -7165,7 +7210,7 @@ store that is being released. - 2. buffer/global/flat_atomic + 3. buffer/global/flat_atomic fence release - singlethread *none* *none* - wavefront fence release - workgroup *none* 1. s_waitcnt lgkm/vmcnt(0) @@ -7298,7 +7343,20 @@ following fence-paired-atomic. - fence release - system *none* 1. s_waitcnt lgkmcnt(0) & + fence release - system *none* 1. buffer_wbl2 + + - If OpenCL and + address space is + local, omit. + - Must happen before + following s_waitcnt. + - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at system scope. + + 2. s_waitcnt lgkmcnt(0) & vmcnt(0) - If TgSplit execution mode, @@ -7588,7 +7646,17 @@ will not see stale global data. - atomicrmw acq_rel - system - global 1. s_waitcnt lgkmcnt(0) & + atomicrmw acq_rel - system - global 1. 
buffer_wbl2 + + - Must happen before + following s_waitcnt. + - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at system scope. + + 2. s_waitcnt lgkmcnt(0) & vmcnt(0) - If TgSplit execution mode, @@ -7629,11 +7697,11 @@ atomicrmw that is being released. - 2. buffer/global_atomic - 3. s_waitcnt vmcnt(0) + 3. buffer/global_atomic + 4. s_waitcnt vmcnt(0) - Must happen before - following + following buffer_invl2 and buffer_wbinvl1_vol. - Ensures the atomicrmw has @@ -7641,7 +7709,8 @@ invalidating the caches. - 4. buffer_wbinvl1_vol + 5. buffer_invl2; + buffer_wbinvl1_vol - Must happen before any following @@ -7651,7 +7720,9 @@ - Ensures that following loads will not see - stale L1 global data. + stale L1 global data, + nor see stale L2 MTYPE + NC global data. MTYPE RW and CC memory will never be stale in L2 due to the memory probes. @@ -7726,7 +7797,17 @@ will not see stale global data. - atomicrmw acq_rel - system - generic 1. s_waitcnt lgkmcnt(0) & + atomicrmw acq_rel - system - generic 1. buffer_wbl2 + + - Must happen before + following s_waitcnt. + - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at system scope. + + 2. s_waitcnt lgkmcnt(0) & vmcnt(0) - If TgSplit execution mode, @@ -7767,8 +7848,8 @@ atomicrmw that is being released. - 2. flat_atomic - 3. s_waitcnt vmcnt(0) & + 3. flat_atomic + 4. s_waitcnt vmcnt(0) & lgkmcnt(0) - If TgSplit execution mode, @@ -7776,7 +7857,7 @@ - If OpenCL, omit lgkmcnt(0). - Must happen before - following + following buffer_invl2 and buffer_wbinvl1_vol. - Ensures the atomicrmw has @@ -7784,7 +7865,8 @@ invalidating the caches. - 4. buffer_wbinvl1_vol + 5. buffer_invl2; + buffer_wbinvl1_vol - Must happen before any following @@ -7794,7 +7876,9 @@ - Ensures that following loads will not see - stale L1 global data. + stale L1 global data, + nor see stale L2 MTYPE + NC global data. 
MTYPE RW and CC memory will never be stale in L2 due to the memory probes. @@ -7902,7 +7986,7 @@ the acquire-fence-paired-atomic. - 3. buffer_wbinvl1_vol + 2. buffer_wbinvl1_vol - If not TgSplit execution mode, omit. @@ -8007,7 +8091,20 @@ requirements of acquire. - fence acq_rel - system *none* 1. s_waitcnt lgkmcnt(0) & + fence acq_rel - system *none* 1. buffer_wbl2 + + - If OpenCL and + address space is + local, omit. + - Must happen before + following s_waitcnt. + - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at system scope. + + 2. s_waitcnt lgkmcnt(0) & vmcnt(0) - If TgSplit execution mode, @@ -8048,7 +8145,7 @@ atomic/store atomic/atomicrmw. - Must happen before - the following + the following buffer_invl2 and buffer_wbinvl1_vol. - Ensures that the preceding @@ -8087,7 +8184,8 @@ requirements of release. - 2. buffer_wbinvl1_vol + 3. buffer_invl2; + buffer_wbinvl1_vol - Must happen before any following @@ -8098,7 +8196,9 @@ - Ensures that following loads will not see - stale L1 global data. + stale L1 global data, + nor see stale L2 MTYPE + NC global data. MTYPE RW and CC memory will never be stale in L2 due to the memory probes. 
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -452,6 +452,12 @@ SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, Position Pos) const override; + + bool insertRelease(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + bool IsCrossAddrSpaceOrdering, + Position Pos) const override; }; class SIGfx10CacheControl : public SIGfx7CacheControl { @@ -1265,9 +1271,26 @@ bool Changed = false; + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + if (Pos == Position::AFTER) + ++MI; + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { switch (Scope) { case SIAtomicScope::SYSTEM: + // Ensures that following loads will not see stale remote VMEM data or + // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and + // CC will never be stale due to the local memory probes. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2)); + // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the + // hardware does not reorder memory operations by the same wave with + // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to + // remove any cache lines of earlier writes by the same wave and ensures + // later reads by the same wave will refetch the cache lines. + Changed = true; + break; case SIAtomicScope::AGENT: // Same as GFX7. break; @@ -1297,11 +1320,62 @@ /// Other address spaces do not have a cache. 
+ if (Pos == Position::AFTER) + --MI; + Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos); return Changed; } +bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + bool IsCrossAddrSpaceOrdering, + Position Pos) const { + bool Changed = false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + if (Pos == Position::AFTER) + ++MI; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the + // hardware does not reorder memory operations by the same wave with + // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed + // to initiate writeback of any dirty cache lines of earlier writes by the + // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the + // writeback has completed. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)); + // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT + // vmcnt(0)" needed by the "BUFFER_WBL2". + Changed = true; + break; + case SIAtomicScope::AGENT: + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // Same as GFX7. 
+ break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if (Pos == Position::AFTER) + --MI; + + Changed |= + SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, + IsCrossAddrSpaceOrdering, Pos); + + return Changed; +} + bool SIGfx10CacheControl::enableLoadCacheBypass( const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -424,9 +424,11 @@ ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 +; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] @@ -470,9 +472,11 @@ ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 +; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] @@ -526,9 +530,11 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 +; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 
v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -571,9 +577,11 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 +; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -655,9 +663,11 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] @@ -702,9 +712,11 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] @@ -730,9 +742,11 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 +; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: 
v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] @@ -775,9 +789,11 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 +; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -70,9 +70,11 @@ ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] @@ -527,9 +529,11 @@ ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -1275,13 +1275,17 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: system_acquire_fence: ; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm entry: @@ -1319,11 +1323,13 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: system_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_endpgm entry: @@ -1367,13 +1373,17 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: system_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm entry: @@ -1417,13 +1427,17 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: system_seq_cst_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_seq_cst_fence: ; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm entry: @@ -1467,13 +1481,17 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acquire_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_acquire_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm entry: @@ -1511,11 +1529,13 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_release_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_release_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_endpgm entry: @@ -1559,13 +1579,17 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acq_rel_fence: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_acq_rel_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm entry: @@ -1609,13 +1633,17 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: system_one_as_seq_cst_fence: ; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: system_one_as_seq_cst_fence: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -242,6 +242,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 @@ -256,6 +257,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 @@ -341,6 +343,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 @@ -356,6 +359,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 @@ -572,6 +576,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -583,6 +588,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -652,6 +658,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -663,6 +670,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -811,6 +819,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -823,6 +832,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; 
GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { @@ -891,6 +901,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -902,6 +913,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -982,9 +994,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -995,9 +1009,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { @@ -1077,9 +1093,11 @@ ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1090,9 +1108,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { @@ -1170,6 +1190,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1183,6 +1204,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1266,9 +1288,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: 
flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1280,9 +1304,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1366,9 +1392,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1380,9 +1408,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1557,6 +1587,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] 
op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1569,6 +1600,7 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { @@ -1650,6 +1682,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1661,6 +1694,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1754,9 +1788,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ 
-1767,9 +1803,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { @@ -1862,9 +1900,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1875,9 +1915,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { @@ -1966,6 +2008,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1978,6 +2021,7 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { @@ -2070,9 +2114,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2083,9 +2129,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { @@ -2178,9 +2226,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; 
GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2191,9 +2241,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { @@ -2286,9 +2338,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2299,9 +2353,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { @@ -2394,9 +2450,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: 
buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2407,9 +2465,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { @@ -2508,6 +2568,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2521,6 +2582,7 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2626,9 +2688,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2640,9 +2704,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2748,9 +2814,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2762,9 +2830,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; 
GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2866,6 +2936,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2879,6 +2950,7 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2984,9 +3056,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2998,9 +3072,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; 
GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -3106,9 +3182,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -3120,9 +3198,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -3228,9 +3308,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -3242,9 +3324,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -3350,9 +3434,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -3364,9 +3450,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -3619,6 +3707,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 @@ -3634,6 +3723,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 @@ -3723,6 +3813,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 @@ -3739,6 +3830,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 @@ -3955,6 +4047,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -3966,6 +4059,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -4035,6 +4129,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -4046,6 +4141,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -4192,6 +4288,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4204,6 +4301,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { @@ -4272,6 +4370,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -4283,6 +4382,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -4361,9 +4461,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4374,9 +4476,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { @@ -4454,9 +4558,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4467,9 +4573,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { @@ -4550,6 +4658,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4564,6 +4673,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -4650,9 +4760,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4665,9 +4777,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -4754,9 +4868,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4769,9 +4885,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -4944,6 +5062,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4956,6 +5075,7 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { @@ -5037,6 +5157,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -5048,6 +5169,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -5139,9 +5261,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5152,9 +5276,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { @@ -5245,9 +5371,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5258,9 +5386,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { @@ -5347,6 +5477,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5359,6 +5490,7 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { @@ -5449,9 +5581,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5462,9 +5596,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; 
GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { @@ -5555,9 +5691,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5568,9 +5706,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { @@ -5661,9 +5801,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5674,9 +5816,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { @@ -5767,9 +5911,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5780,9 +5926,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { @@ -5885,6 +6033,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5899,6 +6048,7 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: 
buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6008,9 +6158,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6023,9 +6175,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6135,9 +6289,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6150,9 +6306,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6258,6 +6416,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6272,6 +6431,7 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6381,9 +6541,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6396,9 +6558,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] 
op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6508,9 +6672,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6523,9 +6689,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6635,9 +6803,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] 
offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6650,9 +6820,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6762,9 +6934,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6777,9 +6951,11 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-TGSPLIT-NEXT: s_endpgm 
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -276,6 +276,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -287,6 +288,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -383,6 +385,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -394,6 +397,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -636,6 +640,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: 
s_endpgm @@ -647,6 +652,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -726,6 +732,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -737,6 +744,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -903,6 +911,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -915,6 +924,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { @@ -993,6 +1003,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1004,6 +1015,7 @@ ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1094,9 +1106,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1107,9 +1121,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { @@ -1199,9 +1215,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1212,9 +1230,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { @@ -1304,6 +1324,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1317,6 +1338,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1413,9 +1435,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1427,9 +1451,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; 
GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1526,9 +1552,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1540,9 +1568,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1725,6 +1755,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1737,6 +1768,7 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { @@ -1823,6 +1855,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -1834,6 +1867,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -1932,9 +1966,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1945,9 +1981,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { @@ -2045,9 +2083,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], 
s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2058,9 +2098,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { @@ -2153,6 +2195,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2165,6 +2208,7 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { @@ -2262,9 +2306,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2275,9 +2321,11 @@ ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { @@ -2375,9 +2423,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2388,9 +2438,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { @@ -2488,9 +2540,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 
+; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2501,9 +2555,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { @@ -2601,9 +2657,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2614,9 +2672,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { @@ -2716,6 +2776,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2729,6 +2790,7 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2836,9 +2898,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2850,9 +2914,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -2960,9 +3026,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -2974,9 +3042,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -3079,6 +3149,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -3092,6 +3163,7 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -3199,9 +3271,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap 
v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -3213,9 +3287,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -3323,9 +3399,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -3337,9 +3415,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -3447,9 
+3527,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -3461,9 +3543,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -3571,9 +3655,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -3585,9 +3671,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -3869,6 +3957,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -3880,6 +3969,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -3976,6 +4066,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -3987,6 +4078,7 @@ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -4229,6 +4321,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: 
global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -4240,6 +4333,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -4319,6 +4413,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -4330,6 +4425,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -4496,6 +4592,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4508,6 +4605,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { @@ -4586,6 +4684,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 
@@ -4597,6 +4696,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -4687,9 +4787,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4700,9 +4802,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { @@ -4792,9 +4896,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -4805,9 +4911,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: 
global_atomic_swap v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { @@ -4897,6 +5005,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -4910,6 +5019,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -5006,9 +5116,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -5020,9 +5132,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -5119,9 +5233,11 @@ ; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -5133,9 +5249,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -5318,6 +5436,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5330,6 +5449,7 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { @@ -5416,6 +5536,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -5427,6 +5548,7 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -5525,9 +5647,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5538,9 +5662,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { @@ -5638,9 +5764,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: 
buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5651,9 +5779,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { @@ -5746,6 +5876,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5758,6 +5889,7 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { @@ -5855,9 +5987,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5868,9 +6002,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { @@ -5968,9 +6104,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -5981,9 +6119,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { @@ -6081,9 +6221,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6094,9 +6236,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { @@ -6194,9 +6338,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -6207,9 +6353,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { @@ -6309,6 +6457,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6322,6 +6471,7 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], 
s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6429,9 +6579,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6443,9 +6595,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6553,9 +6707,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] 
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6567,9 +6723,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6672,6 +6830,7 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6685,6 +6844,7 @@ ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6792,9 +6952,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6806,9 +6968,11 @@ ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -6916,9 +7080,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -6930,9 +7096,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -7040,9 +7208,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, 
v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7054,9 +7224,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm @@ -7164,9 +7336,11 @@ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm @@ -7178,9 +7352,11 @@ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm