Index: lib/Target/AMDGPU/SIMemoryLegalizer.cpp
===================================================================
--- lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -53,20 +53,28 @@
     SyncScope::ID SSID = SyncScope::System;
     AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent;
     AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent;
+    bool IsAtomic = true;
+    bool IsNonTemporal = false;
 
     MemOpInfo() = default;
 
     MemOpInfo(SyncScope::ID SSID,
               AtomicOrdering Ordering,
-              AtomicOrdering FailureOrdering)
+              AtomicOrdering FailureOrdering,
+              bool IsAtomic = true,
+              bool IsNonTemporal = false)
         : SSID(SSID),
           Ordering(Ordering),
-          FailureOrdering(FailureOrdering) {}
+          FailureOrdering(FailureOrdering),
+          IsAtomic(IsAtomic),
+          IsNonTemporal(IsNonTemporal) {}
 
     MemOpInfo(const MachineMemOperand *MMO)
         : SSID(MMO->getSyncScopeID()),
           Ordering(MMO->getOrdering()),
-          FailureOrdering(MMO->getFailureOrdering()) {}
+          FailureOrdering(MMO->getFailureOrdering()),
+          IsAtomic(MMO->isAtomic()),
+          IsNonTemporal(MMO->getFlags() & MachineMemOperand::MONonTemporal) {}
   };
 
   /// \brief LLVM context.
@@ -87,6 +95,34 @@
   /// \brief List of atomic pseudo instructions.
   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
 
+  /// \brief Sets named bit (BitName) to "true" if present in \p MI. Returns
+  /// true if \p MI is modified, false otherwise.
+  template <uint16_t BitName>
+  bool enableNamedBit(const MachineBasicBlock::iterator &MI) const {
+    int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
+    if (BitIdx == -1)
+      return false;
+
+    MachineOperand &Bit = MI->getOperand(BitIdx);
+    if (Bit.getImm() == 1)
+      return false;
+
+    Bit.setImm(1);
+    return true;
+  }
+
+  /// \brief Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
+  /// is modified, false otherwise.
+  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
+    return enableNamedBit<AMDGPU::OpName::glc>(MI);
+  }
+
+  /// \brief Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
+  /// is modified, false otherwise.
+  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
+    return enableNamedBit<AMDGPU::OpName::slc>(MI);
+  }
+
   /// \brief Inserts "buffer_wbinvl1_vol" instruction \p Before or after \p MI.
   /// Always returns true.
   bool insertBufferWbinvl1Vol(MachineBasicBlock::iterator &MI,
@@ -96,10 +132,6 @@
   bool insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI,
                            bool Before = true) const;
 
-  /// \brief Sets GLC bit if present in \p MI. Returns true if \p MI is
-  /// modified, false otherwise.
-  bool setGLC(const MachineBasicBlock::iterator &MI) const;
-
   /// \brief Removes all processed atomic pseudo instructions from the current
   /// function. Returns true if current function is modified, false otherwise.
   bool removeAtomicPseudoMIs();
@@ -195,19 +227,6 @@
   return true;
 }
 
-bool SIMemoryLegalizer::setGLC(const MachineBasicBlock::iterator &MI) const {
-  int GLCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::glc);
-  if (GLCIdx == -1)
-    return false;
-
-  MachineOperand &GLC = MI->getOperand(GLCIdx);
-  if (GLC.getImm() == 1)
-    return false;
-
-  GLC.setImm(1);
-  return true;
-}
-
 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
   if (AtomicPseudoMIs.empty())
     return false;
@@ -236,7 +255,7 @@
     return MemOpInfo();
 
   const MachineMemOperand *MMO = *MI->memoperands_begin();
-  if (!MMO->isAtomic())
+  if (!MMO->isAtomic() && !(MMO->getFlags() & MachineMemOperand::MONonTemporal))
     return None;
 
   return MemOpInfo(MMO);
@@ -252,7 +271,7 @@
     return MemOpInfo();
 
   const MachineMemOperand *MMO = *MI->memoperands_begin();
-  if (!MMO->isAtomic())
+  if (!MMO->isAtomic() && !(MMO->getFlags() & MachineMemOperand::MONonTemporal))
     return None;
 
   return MemOpInfo(MMO);
@@ -313,11 +332,17 @@
   assert(MI->mayLoad() && !MI->mayStore());
 
   bool Changed = false;
+  if (MOI.IsNonTemporal) {
+    Changed |= enableSLCBit(MI);
+    Changed |= enableGLCBit(MI);
+    return Changed;
+  }
+
   if (MOI.SSID == SyncScope::System ||
       MOI.SSID == MMI->getAgentSSID()) {
     if (MOI.Ordering == AtomicOrdering::Acquire ||
         MOI.Ordering == AtomicOrdering::SequentiallyConsistent)
-      Changed |= setGLC(MI);
+      Changed |= enableGLCBit(MI);
 
     if (MOI.Ordering == AtomicOrdering::SequentiallyConsistent)
       Changed |= insertWaitcntVmcnt0(MI);
@@ -344,6 +369,12 @@
   assert(!MI->mayLoad() && MI->mayStore());
 
   bool Changed = false;
+  if (MOI.IsNonTemporal) {
+    Changed |= enableSLCBit(MI);
+    Changed |= enableGLCBit(MI);
+    return Changed;
+  }
+
   if (MOI.SSID == SyncScope::System ||
       MOI.SSID == MMI->getAgentSSID()) {
     if (MOI.Ordering == AtomicOrdering::Release ||
@@ -418,7 +449,7 @@
   } else if (MOI.SSID == SyncScope::SingleThread ||
              MOI.SSID == MMI->getWorkgroupSSID() ||
              MOI.SSID == MMI->getWavefrontSSID()) {
-    Changed |= setGLC(MI);
+    Changed |= enableGLCBit(MI);
     return Changed;
   } else {
     reportUnknownSynchScope(MI);
@@ -449,7 +480,7 @@
   } else if (MOI.SSID == SyncScope::SingleThread ||
              MOI.SSID == MMI->getWorkgroupSSID() ||
             MOI.SSID == MMI->getWavefrontSSID()) {
-    Changed |= setGLC(MI);
+    Changed |= enableGLCBit(MI);
    return Changed;
   } else {
     reportUnknownSynchScope(MI);
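
For context on where the MONonTemporal flag checked above comes from: the pass only ever inspects MachineMemOperand flags, so the new IsNonTemporal field relies on instruction selection having already lowered IR-level !nontemporal metadata into a MachineMemOperand carrying MachineMemOperand::MONonTemporal. Below is a minimal IR sketch of that producer side (the kernel name is invented for illustration); per the LangRef, the !nontemporal operand must be a single metadata node holding i32 1:

define amdgpu_kernel void @nontemporal_copy_sketch(i32 addrspace(1)* %in,
                                                   i32 addrspace(1)* %out) {
entry:
  ; Both accesses reach SIMemoryLegalizer with MONonTemporal set, so the
  ; early-return path added above should enable glc and slc on each.
  %v = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0
  store i32 %v, i32 addrspace(1)* %out, align 4, !nontemporal !0
  ret void
}

!0 = !{i32 1}

The two new tests below check the visible effect per address space: buffer, flat, and global accesses gain the glc slc bits, while DS (local) accesses are left untouched, since ds_read/ds_write have no GLC or SLC operand and enableNamedBit simply returns false for them.
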
Index: test/CodeGen/AMDGPU/memory-legalizer-nontemporal-load.ll
===================================================================
--- test/CodeGen/AMDGPU/memory-legalizer-nontemporal-load.ll
+++ test/CodeGen/AMDGPU/memory-legalizer-nontemporal-load.ll
@@ -0,0 +1,97 @@
+; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx800 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+; GCN-LABEL: {{^}}nontemporal_load_private_0
+; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
+define amdgpu_kernel void @nontemporal_load_private_0(
+    i32* %in, i32 addrspace(4)* %out) {
+entry:
+  %val = load i32, i32* %in, align 4, !nontemporal !0
+  store i32 %val, i32 addrspace(4)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}nontemporal_load_private_1
+; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
+define amdgpu_kernel void @nontemporal_load_private_1(
+    i32* %in, i32 addrspace(4)* %out) {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %val.gep = getelementptr inbounds i32, i32* %in, i32 %tid
+  %val = load i32, i32* %val.gep, align 4, !nontemporal !0
+  store i32 %val, i32 addrspace(4)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}nontemporal_load_global_0
+; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0x0{{$}}
+define amdgpu_kernel void @nontemporal_load_global_0(
+    i32 addrspace(1)* %in, i32 addrspace(4)* %out) {
+entry:
+  %val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0
+  store i32 %val, i32 addrspace(4)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}nontemporal_load_global_1
+; GFX8: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
+; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], off glc slc{{$}}
+define amdgpu_kernel void @nontemporal_load_global_1(
+    i32 addrspace(1)* %in, i32 addrspace(4)* %out) {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %val.gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
+  %val = load i32, i32 addrspace(1)* %val.gep, align 4, !nontemporal !0
+  store i32 %val, i32 addrspace(4)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}nontemporal_load_local_0
+; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @nontemporal_load_local_0(
+    i32 addrspace(3)* %in, i32 addrspace(4)* %out) {
+entry:
+  %val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0
+  store i32 %val, i32 addrspace(4)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}nontemporal_load_local_1
+; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @nontemporal_load_local_1(
+    i32 addrspace(3)* %in, i32 addrspace(4)* %out) {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %val.gep = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %tid
+  %val = load i32, i32 addrspace(3)* %val.gep, align 4, !nontemporal !0
+  store i32 %val, i32 addrspace(4)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}nontemporal_load_flat_0
+; GCN: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
+define amdgpu_kernel void @nontemporal_load_flat_0(
+    i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
+entry:
+  %val = load i32, i32 addrspace(4)* %in, align 4, !nontemporal !0
+  store i32 %val, i32 addrspace(4)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}nontemporal_load_flat_1
+; GCN: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
+define amdgpu_kernel void @nontemporal_load_flat_1(
+    i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %val.gep = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tid
+  %val = load i32, i32 addrspace(4)* %val.gep, align 4, !nontemporal !0
+  store i32 %val, i32 addrspace(4)* %out
+  ret void
+}
+
+!0 = !{i32 1}
Index: test/CodeGen/AMDGPU/memory-legalizer-nontemporal-store.ll
===================================================================
--- test/CodeGen/AMDGPU/memory-legalizer-nontemporal-store.ll
+++ test/CodeGen/AMDGPU/memory-legalizer-nontemporal-store.ll
@@ -0,0 +1,97 @@
+; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx800 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+; GCN-LABEL: {{^}}nontemporal_store_private_0
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
+define amdgpu_kernel void @nontemporal_store_private_0(
+    i32 addrspace(4)* %in, i32* %out) {
+entry:
+  %val = load i32, i32 addrspace(4)* %in, align 4
+  store i32 %val, i32* %out, !nontemporal !0
+  ret void
+}
+
+; GCN-LABEL: {{^}}nontemporal_store_private_1
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
+define amdgpu_kernel void @nontemporal_store_private_1(
+    i32 addrspace(4)* %in, i32* %out) {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %val = load i32, i32 addrspace(4)* %in, align 4
+  %out.gep = getelementptr inbounds i32, i32* %out, i32 %tid
+  store i32 %val, i32* %out.gep, !nontemporal !0
+  ret void
+}
+
+; GCN-LABEL: {{^}}nontemporal_store_global_0
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc slc{{$}}
+define amdgpu_kernel void @nontemporal_store_global_0(
+    i32 addrspace(4)* %in, i32 addrspace(1)* %out) {
+entry:
+  %val = load i32, i32 addrspace(4)* %in, align 4
+  store i32 %val, i32 addrspace(1)* %out, !nontemporal !0
+  ret void
+}
+
+; GCN-LABEL: {{^}}nontemporal_store_global_1
+; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
+; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}}
+define amdgpu_kernel void @nontemporal_store_global_1(
+    i32 addrspace(4)* %in, i32 addrspace(1)* %out) {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %val = load i32, i32 addrspace(4)* %in, align 4
+  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
+  store i32 %val, i32 addrspace(1)* %out.gep, !nontemporal !0
+  ret void
+}
+
+; GCN-LABEL: {{^}}nontemporal_store_local_0
+; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @nontemporal_store_local_0(
+    i32 addrspace(4)* %in, i32 addrspace(3)* %out) {
+entry:
+  %val = load i32, i32 addrspace(4)* %in, align 4
+  store i32 %val, i32 addrspace(3)* %out, !nontemporal !0
+  ret void
+}
+
+; GCN-LABEL: {{^}}nontemporal_store_local_1
+; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @nontemporal_store_local_1(
+    i32 addrspace(4)* %in, i32 addrspace(3)* %out) {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %val = load i32, i32 addrspace(4)* %in, align 4
+  %out.gep = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %tid
+  store i32 %val, i32 addrspace(3)* %out.gep, !nontemporal !0
+  ret void
+}
+
+; GCN-LABEL: {{^}}nontemporal_store_flat_0
+; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
+define amdgpu_kernel void @nontemporal_store_flat_0(
+    i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
+entry:
+  %val = load i32, i32 addrspace(4)* %in, align 4
+  store i32 %val, i32 addrspace(4)* %out, !nontemporal !0
+  ret void
+}
+
+; GCN-LABEL: {{^}}nontemporal_store_flat_1
+; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
+define amdgpu_kernel void @nontemporal_store_flat_1(
+    i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %val = load i32, i32 addrspace(4)* %in, align 4
+  %out.gep = getelementptr inbounds i32, i32 addrspace(4)* %out, i32 %tid
+  store i32 %val, i32 addrspace(4)* %out.gep, !nontemporal !0
+  ret void
+}
+
+!0 = !{i32 1}
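
One closing note on scope, with a hypothetical counter-example (not part of the patch or its tests; the kernel name is invented): an ordinary access with neither an atomic ordering nor !nontemporal metadata now fails the combined isAtomic/MONonTemporal filter in the two memory-operand queries patched above, which return None, so the legalizer leaves the instruction's cache bits at their defaults:

define amdgpu_kernel void @plain_copy_sketch(i32 addrspace(1)* %in,
                                             i32 addrspace(1)* %out) {
entry:
  ; No !nontemporal metadata and no atomic ordering: the MemOpInfo lookup
  ; returns None, SIMemoryLegalizer changes nothing here, and neither glc
  ; nor slc is set on the selected instructions.
  %v = load i32, i32 addrspace(1)* %in, align 4
  store i32 %v, i32 addrspace(1)* %out, align 4
  ret void
}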