Index: lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -16,6 +16,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUIntrinsicInfo.h"
 #include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/Support/Debug.h"
@@ -30,6 +31,8 @@
 class AMDGPUAnnotateUniformValues : public FunctionPass,
        public InstVisitor<AMDGPUAnnotateUniformValues> {
   DivergenceAnalysis *DA;
+  MemoryDependenceResults *MDR;
+
 public:
   static char ID;
@@ -42,6 +45,7 @@
   }
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<DivergenceAnalysis>();
+    AU.addRequired<MemoryDependenceWrapperPass>();
     AU.setPreservesAll();
   }
@@ -55,6 +59,7 @@
 INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
                       "Add AMDGPU uniform metadata", false, false)
 INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
 INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
                     "Add AMDGPU uniform metadata", false, false)
@@ -80,9 +85,15 @@
   if (!DA->isUniform(Ptr))
     return;
 
-  if (Instruction *PtrI = dyn_cast<Instruction>(Ptr))
-    setUniformMetadata(PtrI);
-
+  if (Instruction *PtrI = dyn_cast<Instruction>(Ptr)) {
+    setUniformMetadata(PtrI);
+    // TODO: DL->getPointerSize
+    MemDepResult mdr =
+        MDR->getSimplePointerDependencyFrom(MemoryLocation(Ptr),
+            true, BasicBlock::iterator(PtrI), PtrI->getParent(), &I);
+    if (!mdr.isClobber() && !mdr.isDef())
+      PtrI->setMetadata("amdgpu.noclobber", MDNode::get(PtrI->getContext(), {}));
+  }
 }
 
 bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
@@ -94,6 +105,7 @@
     return false;
 
   DA = &getAnalysis<DivergenceAnalysis>();
+  MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
   visit(F);
 
   return true;
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -116,6 +116,7 @@
                         MachineFunction &MF) const override;
 
   bool isMemOpUniform(const SDNode *N) const;
+  bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const;
   bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
 
   TargetLoweringBase::LegalizeTypeAction
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -524,6 +524,22 @@
   return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
 }
 
+bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
+  const MemSDNode *MemNode = cast<MemSDNode>(N);
+  const Value *Ptr = MemNode->getMemOperand()->getValue();
+
+  // UndefValue means this is a load of a kernel input. These are uniform.
+  // Sometimes LDS instructions have constant pointers.
+  // If Ptr is null, then that means this mem operand contains a
+  // PseudoSourceValue like GOT.
+  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
+      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
+    return true;
+
+  const Instruction *I = dyn_cast<Instruction>(Ptr);
+  return I && I->getMetadata("amdgpu.noclobber");
+}
+
 bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
   const MemSDNode *MemNode = cast<MemSDNode>(N);
   const Value *Ptr = MemNode->getMemOperand()->getValue();
@@ -2605,11 +2621,20 @@
     if (isMemOpUniform(Load))
       return SDValue();
     // Non-uniform loads will be selected to MUBUF instructions, so they
-    // have the same legalization requires ments as global and private
+    // have the same legalization requirements as global and private
     // loads.
     //
     LLVM_FALLTHROUGH;
   case AMDGPUAS::GLOBAL_ADDRESS:
+  {
+    if (isMemOpUniform(Load) && isMemOpHasNoClobberedMemOperand(Load))
+      return SDValue();
+    // Non-uniform loads will be selected to MUBUF instructions, so they
+    // have the same legalization requirements as global and private
+    // loads.
+    //
+  }
+    LLVM_FALLTHROUGH;
   case AMDGPUAS::FLAT_ADDRESS:
     if (NumElements > 4)
       return SplitVectorLoad(Op, DAG);
Index: lib/Target/AMDGPU/SMInstructions.td
===================================================================
--- lib/Target/AMDGPU/SMInstructions.td
+++ lib/Target/AMDGPU/SMInstructions.td
@@ -220,8 +220,10 @@
 def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
   auto Ld = cast<LoadSDNode>(N);
   return Ld->getAlignment() >= 4 &&
-    Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
-    static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N);
+    ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+    static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) ||
+    (Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+    static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
 }]>;
 
 def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">;
Index: test/CodeGen/AMDGPU/global_smrd.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/global_smrd.ll
@@ -0,0 +1,27 @@
+; RUN: llc -O2 -mtriple amdgcn--amdhsa -mcpu=fiji < %s | FileCheck %s
+
+; CHECK: s_load_dwordx4
+; CHECK-NOT: flat_load_dword
+
+define amdgpu_kernel void @snork(float addrspace(1)* readonly %arg, float addrspace(1)* nocapture %arg1) {
+bb:
+  %tmp2 = load float, float addrspace(1)* %arg, align 4, !tbaa !8
+  %tmp3 = fadd float %tmp2, 0.000000e+00
+  %tmp4 = getelementptr inbounds float, float addrspace(1)* %arg, i64 1
+  %tmp5 = load float, float addrspace(1)* %tmp4, align 4, !tbaa !8
+  %tmp6 = fadd float %tmp3, %tmp5
+  %tmp7 = getelementptr inbounds float, float addrspace(1)* %arg, i64 2
+  %tmp8 = load float, float addrspace(1)* %tmp7, align 4, !tbaa !8
+  %tmp9 = fadd float %tmp6, %tmp8
+  %tmp10 = getelementptr inbounds float, float addrspace(1)* %arg, i64 3
+  %tmp11 = load float, float addrspace(1)* %tmp10, align 4, !tbaa !8
+  %tmp12 = fadd float %tmp9, %tmp11
+  %tmp13 = getelementptr inbounds float, float addrspace(1)* %arg1
+  store float %tmp12, float addrspace(1)* %tmp13, align 4, !tbaa !8
+  ret void
+}
+
+!8 = !{!9, !9, i64 0}
+!9 = !{!"float", !10, i64 0}
+!10 = !{!"omnipotent char", !11, i64 0}
+!11 = !{!"Simple C/C++ TBAA"}