diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -795,6 +795,8 @@ switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: + // Set L1 cache policy to MISS_EVICT. + // Note: there is no L2 cache bypass policy at the ISA level. Changed |= enableGLCBit(MI); break; case SIAtomicScope::WORKGROUP: @@ -837,8 +839,10 @@ assert(MI->mayLoad() && MI->mayStore()); bool Changed = false; - /// The L1 cache is write through so does not need to be bypassed. There is no - /// bypass control for the L2 cache at the isa level. + /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically + /// bypassed, and the GLC bit is instead used to indicate if they are + /// return or no-return. + /// Note: there is no L2 cache coherent bypass control at the ISA level. return Changed; } @@ -860,6 +864,9 @@ bool Changed = false; if (IsVolatile) { + // Set L1 cache policy to be MISS_EVICT for load instructions + // and MISS_LRU for store instructions. + // Note: there is no L2 cache bypass policy at the ISA level. if (Op == SIMemOp::LOAD) Changed |= enableGLCBit(MI); @@ -875,7 +882,8 @@ } if (IsNonTemporal) { - // Request L1 MISS_EVICT and L2 STREAM for load and store instructions. + // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT + // for both loads and stores, and the L2 cache policy to STREAM. Changed |= enableGLCBit(MI); Changed |= enableSLCBit(MI); return Changed; @@ -1097,6 +1105,8 @@ switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: + // Set the L1 cache policy to MISS_LRU. + // Note: there is no L2 cache bypass policy at the ISA level. Changed |= enableGLCBit(MI); break; case SIAtomicScope::WORKGROUP: @@ -1206,6 +1216,9 @@ bool Changed = false; if (IsVolatile) { + // Set L1 cache policy to be MISS_EVICT for load instructions + // and MISS_LRU for store instructions. + // Note: there is no L2 cache bypass policy at the ISA level. if (Op == SIMemOp::LOAD) Changed |= enableGLCBit(MI); @@ -1221,7 +1234,8 @@ } if (IsNonTemporal) { - // Request L1 MISS_EVICT and L2 STREAM for load and store instructions. + // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT + // for both loads and stores, and the L2 cache policy to STREAM. Changed |= enableGLCBit(MI); Changed |= enableSLCBit(MI); return Changed; @@ -1380,12 +1394,11 @@ bool Changed = false; if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { - /// TODO Do not set glc for rmw atomic operations as they - /// implicitly bypass the L0/L1 caches. - switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: + // Set the L0 and L1 cache policies to MISS_EVICT. + // Note: there is no L2 cache coherent bypass control at the ISA level. Changed |= enableGLCBit(MI); Changed |= enableDLCBit(MI); break; @@ -1434,6 +1447,9 @@ bool Changed = false; if (IsVolatile) { + // Set L0 and L1 cache policy to be MISS_EVICT for load instructions + // and MISS_LRU for store instructions. + // Note: there is no L2 cache coherent bypass control at the ISA level. if (Op == SIMemOp::LOAD) { Changed |= enableGLCBit(MI); Changed |= enableDLCBit(MI); @@ -1450,8 +1466,14 @@ } if (IsNonTemporal) { - // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions. + // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT + // and L2 cache policy to STREAM. + // For stores setting both GLC and SLC configures L0 and L1 cache policy + // to MISS_EVICT and the L2 cache policy to STREAM. + if (Op == SIMemOp::STORE) + Changed |= enableGLCBit(MI); Changed |= enableSLCBit(MI); + return Changed; } diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -216,7 +216,7 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 slc +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 glc slc ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_nontemporal_store_0: @@ -229,7 +229,7 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 slc +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 glc slc ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_0: @@ -306,7 +306,7 @@ ; GFX10-WGP-NEXT: flat_load_dword v2, v[1:2] ; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 slc +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 glc slc ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_nontemporal_store_1: @@ -320,7 +320,7 @@ ; GFX10-CU-NEXT: flat_load_dword v2, v[1:2] ; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 slc +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 glc slc ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_1: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -239,7 +239,7 @@ ; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] slc +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] glc slc ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_nontemporal_store_0: @@ -250,7 +250,7 @@ ; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] slc +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] glc slc ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_nontemporal_store_0: @@ -335,7 +335,7 @@ ; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] slc +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] glc slc ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_nontemporal_store_1: @@ -346,7 +346,7 @@ ; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] slc +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] glc slc ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_nontemporal_store_1: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -309,7 +309,7 @@ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen slc +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: private_nontemporal_store_0: @@ -326,7 +326,7 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen slc +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: private_nontemporal_store_0: @@ -434,7 +434,7 @@ ; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-WGP-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen slc +; GFX10-WGP-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: private_nontemporal_store_1: @@ -451,7 +451,7 @@ ; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-CU-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen slc +; GFX10-CU-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: private_nontemporal_store_1: