Index: lib/Target/AMDGPU/SIMemoryLegalizer.cpp =================================================================== --- lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -51,22 +51,38 @@ private: struct MemOpInfo final { SyncScope::ID SSID = SyncScope::System; - AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent; - AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent; + AtomicOrdering Ordering = AtomicOrdering::NotAtomic; + AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; + bool IsNonTemporal = false; - MemOpInfo() = default; + MemOpInfo(SyncScope::ID SSID, + AtomicOrdering Ordering) + : SSID(SSID), + Ordering(Ordering) { + assert(Ordering != AtomicOrdering::NotAtomic); + } MemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering, AtomicOrdering FailureOrdering) : SSID(SSID), Ordering(Ordering), - FailureOrdering(FailureOrdering) {} + FailureOrdering(FailureOrdering) { + assert(Ordering != AtomicOrdering::NotAtomic); + assert(FailureOrdering != AtomicOrdering::NotAtomic); + } MemOpInfo(const MachineMemOperand *MMO) : SSID(MMO->getSyncScopeID()), Ordering(MMO->getOrdering()), - FailureOrdering(MMO->getFailureOrdering()) {} + FailureOrdering(MMO->getFailureOrdering()), + IsNonTemporal(MMO->getFlags() & MachineMemOperand::MONonTemporal) { + assert(Ordering != AtomicOrdering::NotAtomic || IsNonTemporal); + } + + bool IsAtomic() const { + return Ordering != AtomicOrdering::NotAtomic; + } }; /// \brief LLVM context. @@ -87,6 +103,34 @@ /// \brief List of atomic pseudo instructions. std::list AtomicPseudoMIs; + /// \brief Sets named bit (BitName) to "true" if present in \p MI. Returns + /// true if \p MI is modified, false otherwise. 
+ template <uint16_t BitName> + bool enableNamedBit(const MachineBasicBlock::iterator &MI) const { + int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName); + if (BitIdx == -1) + return false; + + MachineOperand &Bit = MI->getOperand(BitIdx); + if (Bit.getImm() == 1) + return false; + + Bit.setImm(1); + return true; + } + + /// \brief Sets GLC bit to "true" if present in \p MI. Returns true if \p MI + /// is modified, false otherwise. + bool enableGLCBit(const MachineBasicBlock::iterator &MI) const { + return enableNamedBit<AMDGPU::OpName::glc>(MI); + } + + /// \brief Sets SLC bit to "true" if present in \p MI. Returns true if \p MI + /// is modified, false otherwise. + bool enableSLCBit(const MachineBasicBlock::iterator &MI) const { + return enableNamedBit<AMDGPU::OpName::slc>(MI); + } + /// \brief Inserts "buffer_wbinvl1_vol" instruction \p Before or after \p MI. /// Always returns true. bool insertBufferWbinvl1Vol(MachineBasicBlock::iterator &MI, @@ -96,10 +140,6 @@ bool insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI, bool Before = true) const; - /// \brief Sets GLC bit if present in \p MI. Returns true if \p MI is - /// modified, false otherwise. - bool setGLC(const MachineBasicBlock::iterator &MI) const; - /// \brief Removes all processed atomic pseudo instructions from the current /// function. Returns true if current function is modified, false otherwise.
bool removeAtomicPseudoMIs(); @@ -195,19 +235,6 @@ return true; } -bool SIMemoryLegalizer::setGLC(const MachineBasicBlock::iterator &MI) const { - int GLCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::glc); - if (GLCIdx == -1) - return false; - - MachineOperand &GLC = MI->getOperand(GLCIdx); - if (GLC.getImm() == 1) - return false; - - GLC.setImm(1); - return true; -} - bool SIMemoryLegalizer::removeAtomicPseudoMIs() { if (AtomicPseudoMIs.empty()) return false; @@ -233,10 +260,11 @@ if (!(MI->mayLoad() && !MI->mayStore())) return None; if (!MI->hasOneMemOperand()) - return MemOpInfo(); + return MemOpInfo(SyncScope::System, + AtomicOrdering::SequentiallyConsistent); const MachineMemOperand *MMO = *MI->memoperands_begin(); - if (!MMO->isAtomic()) + if (!MMO->isAtomic() && !(MMO->getFlags() & MachineMemOperand::MONonTemporal)) return None; return MemOpInfo(MMO); @@ -249,10 +277,11 @@ if (!(!MI->mayLoad() && MI->mayStore())) return None; if (!MI->hasOneMemOperand()) - return MemOpInfo(); + return MemOpInfo(SyncScope::System, + AtomicOrdering::SequentiallyConsistent); const MachineMemOperand *MMO = *MI->memoperands_begin(); - if (!MMO->isAtomic()) + if (!MMO->isAtomic() && !(MMO->getFlags() & MachineMemOperand::MONonTemporal)) return None; return MemOpInfo(MMO); @@ -269,7 +298,7 @@ static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); AtomicOrdering Ordering = static_cast<AtomicOrdering>(MI->getOperand(0).getImm()); - return MemOpInfo(SSID, Ordering, AtomicOrdering::NotAtomic); + return MemOpInfo(SSID, Ordering); } Optional<SIMemoryLegalizer::MemOpInfo> SIMemoryLegalizer::getAtomicCmpxchgInfo( @@ -279,7 +308,9 @@ if (!(MI->mayLoad() && MI->mayStore())) return None; if (!MI->hasOneMemOperand()) - return MemOpInfo(); + return MemOpInfo(SyncScope::System, + AtomicOrdering::SequentiallyConsistent, + AtomicOrdering::SequentiallyConsistent); const MachineMemOperand *MMO = *MI->memoperands_begin(); if (!MMO->isAtomic()) @@ -297,7 +328,8 @@ if (!(MI->mayLoad() && MI->mayStore())) return None; if
(!MI->hasOneMemOperand()) - return MemOpInfo(); + return MemOpInfo(SyncScope::System, + AtomicOrdering::SequentiallyConsistent); const MachineMemOperand *MMO = *MI->memoperands_begin(); if (!MMO->isAtomic()) @@ -313,11 +345,20 @@ assert(MI->mayLoad() && !MI->mayStore()); bool Changed = false; + if (MOI.IsNonTemporal) { + assert(!MOI.IsAtomic()); + Changed |= enableSLCBit(MI); + Changed |= enableGLCBit(MI); + return Changed; + } + + // Must be atomic beyond this point. + assert(MOI.IsAtomic()); if (MOI.SSID == SyncScope::System || MOI.SSID == MMI->getAgentSSID()) { if (MOI.Ordering == AtomicOrdering::Acquire || MOI.Ordering == AtomicOrdering::SequentiallyConsistent) - Changed |= setGLC(MI); + Changed |= enableGLCBit(MI); if (MOI.Ordering == AtomicOrdering::SequentiallyConsistent) Changed |= insertWaitcntVmcnt0(MI); @@ -344,6 +385,15 @@ assert(!MI->mayLoad() && MI->mayStore()); bool Changed = false; + if (MOI.IsNonTemporal) { + assert(!MOI.IsAtomic()); + Changed |= enableSLCBit(MI); + Changed |= enableGLCBit(MI); + return Changed; + } + + // Must be atomic beyond this point. 
+ assert(MOI.IsAtomic()); if (MOI.SSID == SyncScope::System || MOI.SSID == MMI->getAgentSSID()) { if (MOI.Ordering == AtomicOrdering::Release || @@ -418,7 +468,7 @@ } else if (MOI.SSID == SyncScope::SingleThread || MOI.SSID == MMI->getWorkgroupSSID() || MOI.SSID == MMI->getWavefrontSSID()) { - Changed |= setGLC(MI); + Changed |= enableGLCBit(MI); return Changed; } else { reportUnknownSynchScope(MI); @@ -449,7 +499,7 @@ } else if (MOI.SSID == SyncScope::SingleThread || MOI.SSID == MMI->getWorkgroupSSID() || MOI.SSID == MMI->getWavefrontSSID()) { - Changed |= setGLC(MI); + Changed |= enableGLCBit(MI); return Changed; } else { reportUnknownSynchScope(MI); Index: test/CodeGen/AMDGPU/memory-legalizer-nontemporal-load.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/memory-legalizer-nontemporal-load.ll @@ -0,0 +1,97 @@ +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx800 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s + +declare i32 @llvm.amdgcn.workitem.id.x() + +; GCN-LABEL: {{^}}nontemporal_load_private_0 +; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}} +define amdgpu_kernel void @nontemporal_load_private_0( + i32* %in, i32 addrspace(4)* %out) { +entry: + %val = load i32, i32* %in, align 4, !nontemporal !0 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; GCN-LABEL: {{^}}nontemporal_load_private_1 +; GCN: buffer_load_dword v{{[0-9]+}}, 
v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}} +define amdgpu_kernel void @nontemporal_load_private_1( + i32* %in, i32 addrspace(4)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val.gep = getelementptr inbounds i32, i32* %in, i32 %tid + %val = load i32, i32* %val.gep, align 4, !nontemporal !0 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; GCN-LABEL: {{^}}nontemporal_load_global_0 +; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0x0{{$}} +define amdgpu_kernel void @nontemporal_load_global_0( + i32 addrspace(1)* %in, i32 addrspace(4)* %out) { +entry: + %val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; GCN-LABEL: {{^}}nontemporal_load_global_1 +; GFX8: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} +; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], off glc slc{{$}} +define amdgpu_kernel void @nontemporal_load_global_1( + i32 addrspace(1)* %in, i32 addrspace(4)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val.gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %val.gep, align 4, !nontemporal !0 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; GCN-LABEL: {{^}}nontemporal_load_local_0 +; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}} +define amdgpu_kernel void @nontemporal_load_local_0( + i32 addrspace(3)* %in, i32 addrspace(4)* %out) { +entry: + %val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; GCN-LABEL: {{^}}nontemporal_load_local_1 +; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}} +define amdgpu_kernel void @nontemporal_load_local_1( + i32 addrspace(3)* %in, i32 addrspace(4)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val.gep = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %tid + %val = load i32, i32 
addrspace(3)* %val.gep, align 4, !nontemporal !0 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; GCN-LABEL: {{^}}nontemporal_load_flat_0 +; GCN: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} +define amdgpu_kernel void @nontemporal_load_flat_0( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load i32, i32 addrspace(4)* %in, align 4, !nontemporal !0 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; GCN-LABEL: {{^}}nontemporal_load_flat_1 +; GCN: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} +define amdgpu_kernel void @nontemporal_load_flat_1( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val.gep = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tid + %val = load i32, i32 addrspace(4)* %val.gep, align 4, !nontemporal !0 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +!0 = !{i32 1} Index: test/CodeGen/AMDGPU/memory-legalizer-nontemporal-store.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/memory-legalizer-nontemporal-store.ll @@ -0,0 +1,97 @@ +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx800 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s + +declare i32 @llvm.amdgcn.workitem.id.x() + +; GCN-LABEL: {{^}}nontemporal_store_private_0 +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}} 
+define amdgpu_kernel void @nontemporal_store_private_0( + i32 addrspace(4)* %in, i32* %out) { +entry: + %val = load i32, i32 addrspace(4)* %in, align 4 + store i32 %val, i32* %out, !nontemporal !0 + ret void +} + +; GCN-LABEL: {{^}}nontemporal_store_private_1 +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}} +define amdgpu_kernel void @nontemporal_store_private_1( + i32 addrspace(4)* %in, i32* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val = load i32, i32 addrspace(4)* %in, align 4 + %out.gep = getelementptr inbounds i32, i32* %out, i32 %tid + store i32 %val, i32* %out.gep, !nontemporal !0 + ret void +} + +; GCN-LABEL: {{^}}nontemporal_store_global_0 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc slc{{$}} +define amdgpu_kernel void @nontemporal_store_global_0( + i32 addrspace(4)* %in, i32 addrspace(1)* %out) { +entry: + %val = load i32, i32 addrspace(4)* %in, align 4 + store i32 %val, i32 addrspace(1)* %out, !nontemporal !0 + ret void +} + +; GCN-LABEL: {{^}}nontemporal_store_global_1 +; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} +; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}} +define amdgpu_kernel void @nontemporal_store_global_1( + i32 addrspace(4)* %in, i32 addrspace(1)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val = load i32, i32 addrspace(4)* %in, align 4 + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid + store i32 %val, i32 addrspace(1)* %out.gep, !nontemporal !0 + ret void +} + +; GCN-LABEL: {{^}}nontemporal_store_local_0 +; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}} +define amdgpu_kernel void @nontemporal_store_local_0( + i32 addrspace(4)* %in, i32 addrspace(3)* %out) { +entry: + %val = load i32, i32 addrspace(4)* %in, align 4 + store i32 %val, i32 addrspace(3)* %out, !nontemporal !0 + ret void +} + +; GCN-LABEL: 
{{^}}nontemporal_store_local_1 +; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}} +define amdgpu_kernel void @nontemporal_store_local_1( + i32 addrspace(4)* %in, i32 addrspace(3)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val = load i32, i32 addrspace(4)* %in, align 4 + %out.gep = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %tid + store i32 %val, i32 addrspace(3)* %out.gep, !nontemporal !0 + ret void +} + +; GCN-LABEL: {{^}}nontemporal_store_flat_0 +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} +define amdgpu_kernel void @nontemporal_store_flat_0( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load i32, i32 addrspace(4)* %in, align 4 + store i32 %val, i32 addrspace(4)* %out, !nontemporal !0 + ret void +} + +; GCN-LABEL: {{^}}nontemporal_store_flat_1 +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} +define amdgpu_kernel void @nontemporal_store_flat_1( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val = load i32, i32 addrspace(4)* %in, align 4 + %out.gep = getelementptr inbounds i32, i32 addrspace(4)* %out, i32 %tid + store i32 %val, i32 addrspace(4)* %out.gep, !nontemporal !0 + ret void +} + +!0 = !{i32 1}