diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -11247,7 +11247,7 @@ requirements of acquire, release and sequential consistency. * The L2 cache can be kept coherent with other agents on some targets, or ranges of virtual addresses can be set up to bypass it to ensure system coherence. -* On GFX10.3 a memory attached last level (MALL) cache exists for GPU memory. +* On GFX10.3 and GFX11 a memory attached last level (MALL) cache exists for GPU memory. The MALL cache is fully coherent with GPU memory and has no impact on system coherence. All agents (GPU and CPU) access GPU memory through the MALL cache. @@ -11325,12 +11325,15 @@ - !volatile & nontemporal 1. buffer/global/flat_load - slc=1 + slc=1 dlc=1 + + - If GFX10, omit dlc=1. - volatile 1. buffer/global/flat_load glc=1 dlc=1 + 2. s_waitcnt vmcnt(0) - Must happen before @@ -11353,11 +11356,17 @@ - !volatile & nontemporal 1. buffer/global/flat_store - glc=1 slc=1 + glc=1 slc=1 dlc=1 + + - If GFX10, omit dlc=1. - volatile 1. buffer/global/flat_store + dlc=1 + + - If GFX10, omit dlc=1. + 2. s_waitcnt vscnt(0) - Must happen before @@ -11393,6 +11402,9 @@ - workgroup load atomic monotonic - agent - global 1. buffer/global/flat_load - system - generic glc=1 dlc=1 + + - If GFX11, omit dlc=1. + store atomic monotonic - singlethread - global 1. buffer/global/flat_store - wavefront - generic - workgroup @@ -11504,6 +11516,9 @@ load atomic acquire - agent - global 1. buffer/global_load - system glc=1 dlc=1 + + - If GFX11, omit dlc=1. + 2. s_waitcnt vmcnt(0) - Must happen before @@ -11528,7 +11543,10 @@ stale global data. load atomic acquire - agent - generic 1. flat_load glc=1 dlc=1 - - system 2. s_waitcnt vmcnt(0) & + - system + - If GFX11, omit dlc=1. + + 2. 
s_waitcnt vmcnt(0) & lgkmcnt(0) - If OpenCL omit diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -545,6 +545,20 @@ Position Pos) const override; }; +class SIGfx11CacheControl : public SIGfx10CacheControl { +public: + SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {} + + bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, + SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, + bool IsNonTemporal) const override; +}; + class SIMemoryLegalizer final : public MachineFunctionPass { private: @@ -834,7 +848,9 @@ return std::make_unique<SIGfx6CacheControl>(ST); if (Generation < AMDGPUSubtarget::GFX10) return std::make_unique<SIGfx7CacheControl>(ST); - return std::make_unique<SIGfx10CacheControl>(ST); + if (Generation < AMDGPUSubtarget::GFX11) + return std::make_unique<SIGfx10CacheControl>(ST); + return std::make_unique<SIGfx11CacheControl>(ST); } bool SIGfx6CacheControl::enableLoadCacheBypass( @@ -2012,6 +2028,101 @@ return Changed; } +bool SIGfx11CacheControl::enableLoadCacheBypass( + const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + assert(MI->mayLoad() && !MI->mayStore()); + bool Changed = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + // Set the L0 and L1 cache policies to MISS_EVICT. + // Note: there is no L2 cache coherent bypass control at the ISA level. + Changed |= enableGLCBit(MI); + break; + case SIAtomicScope::WORKGROUP: + // In WGP mode the waves of a work-group can be executing on either CU of + // the WGP. Therefore need to bypass the L0 which is per CU. 
Otherwise in + // CU mode all waves of a work-group are on the same CU, and so the L0 + // does not need to be bypassed. + if (!ST.isCuModeEnabled()) + Changed |= enableGLCBit(MI); + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to bypass. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory caches + /// to be bypassed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. + + return Changed; +} + +bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( + MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, bool IsNonTemporal) const { + + // Only handle load and store, not atomic read-modify-write instructions. The + // latter use glc to indicate if the atomic returns a result and so must not + // be used for cache control. + assert(MI->mayLoad() ^ MI->mayStore()); + + // Only update load and store, not LLVM IR atomic read-modify-write + // instructions. The latter are always marked as volatile so cannot sensibly + // handle it as do not want to pessimize all atomics. Also they do not support + // the nontemporal attribute. + assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); + + bool Changed = false; + + if (IsVolatile) { + // Set L0 and L1 cache policy to be MISS_EVICT for load instructions + // and MISS_LRU for store instructions. + // Note: there is no L2 cache coherent bypass control at the ISA level. + if (Op == SIMemOp::LOAD) + Changed |= enableGLCBit(MI); + + // Set MALL NOALLOC for load and store instructions. + Changed |= enableDLCBit(MI); + + // Ensure operation has completed at system scope to cause all volatile + // operations to be visible outside the program in a global order. 
Do not + // request cross address space as only the global address space can be + // observable outside the program, so no need to cause a waitcnt for LDS + // address space operations. + Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, + Position::AFTER); + return Changed; + } + + if (IsNonTemporal) { + // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT + // and L2 cache policy to STREAM. + // For stores setting both GLC and SLC configures L0 and L1 cache policy + // to MISS_EVICT and the L2 cache policy to STREAM. + if (Op == SIMemOp::STORE) + Changed |= enableGLCBit(MI); + Changed |= enableSLCBit(MI); + + // Set MALL NOALLOC for load and store instructions. + Changed |= enableDLCBit(MI); + return Changed; + } + + return Changed; +} + bool SIMemoryLegalizer::removeAtomicPseudoMIs() { if (AtomicPseudoMIs.empty()) return false; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -258,7 +258,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -271,7 +271,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -404,7 +404,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; 
GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv @@ -419,7 +419,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv @@ -566,7 +566,7 @@ ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv @@ -583,7 +583,7 @@ ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv @@ -7785,7 +7785,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7798,7 +7798,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
@@ -7937,7 +7937,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv @@ -7953,7 +7953,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv @@ -8107,7 +8107,7 @@ ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv @@ -8125,7 +8125,7 @@ ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -121,7 +121,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc dlc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ 
-134,7 +134,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc dlc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -271,7 +271,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s0, v0 ; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc dlc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -285,7 +285,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s0, v0 ; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc dlc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -415,7 +415,7 @@ ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_nontemporal_store_0: @@ -428,7 +428,7 @@ ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -565,7 +565,7 @@ ; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2] ; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc +; GFX11-WGP-NEXT: 
flat_store_b32 v[0:1], v2 glc slc dlc ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_nontemporal_store_1: @@ -579,7 +579,7 @@ ; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2] ; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -258,7 +258,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -271,7 +271,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -406,7 +406,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv @@ -421,7 +421,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; 
GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv @@ -570,7 +570,7 @@ ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv @@ -587,7 +587,7 @@ ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv @@ -7911,7 +7911,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -7924,7 +7924,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8065,7 +8065,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv @@ -8081,7 +8081,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv @@ -8237,7 +8237,7 @@ ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv @@ -8255,7 +8255,7 @@ ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc +; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -265,7 +265,7 @@ ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 dlc ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: s_endpgm ; @@ -279,7 +279,7 @@ ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { @@ -363,7 +363,7 @@ ; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2] ; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; 
GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 dlc ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: s_endpgm ; @@ -378,7 +378,7 @@ ; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2] ; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -250,7 +250,7 @@ ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-WGP-NEXT: s_endpgm @@ -260,7 +260,7 @@ ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-CU-NEXT: s_endpgm @@ -390,7 +390,7 @@ ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv @@ -402,7 +402,7 @@ ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv @@ -540,7 +540,7 @@ ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv @@ -553,7 +553,7 @@ ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv @@ -7847,7 +7847,7 @@ ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-WGP-NEXT: s_endpgm @@ -7857,7 +7857,7 @@ ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-CU-NEXT: s_endpgm @@ -7987,7 +7987,7 @@ ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; 
GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv @@ -7999,7 +7999,7 @@ ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv @@ -8137,7 +8137,7 @@ ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv @@ -8150,7 +8150,7 @@ ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -268,7 +268,7 @@ ; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: global_load_b32 v0, v0, s[0:1] slc +; GFX11-WGP-NEXT: global_load_b32 v0, v0, s[0:1] slc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-WGP-NEXT: s_endpgm @@ -279,7 +279,7 @@ ; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: global_load_b32 v0, v0, s[0:1] slc +; 
GFX11-CU-NEXT: global_load_b32 v0, v0, s[0:1] slc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_endpgm @@ -407,7 +407,7 @@ ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_nontemporal_store_0: @@ -418,7 +418,7 @@ ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -546,7 +546,7 @@ ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_nontemporal_store_1: @@ -557,7 +557,7 @@ ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -250,7 +250,7 @@ ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; 
GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-WGP-NEXT: s_endpgm @@ -260,7 +260,7 @@ ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-CU-NEXT: s_endpgm @@ -392,7 +392,7 @@ ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv @@ -404,7 +404,7 @@ ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv @@ -544,7 +544,7 @@ ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv @@ -557,7 +557,7 @@ ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv @@ -7079,7 +7079,7 @@ ; 
GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-WGP-NEXT: s_endpgm @@ -7089,7 +7089,7 @@ ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-CU-NEXT: s_endpgm @@ -7221,7 +7221,7 @@ ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv @@ -7233,7 +7233,7 @@ ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv @@ -7373,7 +7373,7 @@ ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc +; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv @@ -7386,7 +7386,7 @@ ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: global_load_b32 v1, v0, 
s[0:1] glc dlc +; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -275,7 +275,7 @@ ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] dlc ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: s_endpgm ; @@ -287,7 +287,7 @@ ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { @@ -377,7 +377,7 @@ ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] dlc ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: s_endpgm ; @@ -389,7 +389,7 @@ ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll --- 
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -159,7 +159,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s2 slc +; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s2 slc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_endpgm @@ -171,7 +171,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: scratch_load_b32 v0, off, s2 slc +; GFX11-CU-NEXT: scratch_load_b32 v0, off, s2 slc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_endpgm @@ -335,7 +335,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 slc +; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 slc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_endpgm @@ -347,7 +347,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 slc +; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 slc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_endpgm @@ -508,7 +508,7 @@ ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 glc slc +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 glc slc dlc ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_nontemporal_store_0: @@ -520,7 +520,7 @@ ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 glc slc +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 glc slc dlc ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(5)* %out) { entry: @@ -683,7 +683,7 @@ ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 glc slc +; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 glc slc dlc ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_nontemporal_store_1: @@ -696,7 +696,7 @@ ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 glc slc +; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 glc slc dlc ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(5)* %out) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -351,7 +351,7 @@ ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: s_endpgm ; @@ -364,7 +364,7 @@ ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(5)* %out) { @@ -479,7 +479,7 @@ ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 +; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 dlc ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: s_endpgm ; @@ -493,7 +493,7 @@ ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 +; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(5)* %out) {