Index: llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" @@ -35,7 +36,7 @@ MemoryDependenceResults *MDR; LoopInfo *LI; DenseMap<Value*, GetElementPtrInst*> noClobberClones; - bool isKernelFunc; + bool isEntryFunc; public: static char ID; @@ -127,11 +128,10 @@ auto isGlobalLoad = [&](LoadInst &Load)->bool { return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; }; - // We're tracking up to the Function boundaries - // We cannot go beyond because of FunctionPass restrictions - // Thus we can ensure that memory not clobbered for memory - // operations that live in kernel only. - bool NotClobbered = isKernelFunc && !isClobberedInFunction(&I); + // We're tracking up to the Function boundaries, and cannot go beyond because + // of FunctionPass restrictions. We can ensure that memory is not clobbered + // for memory operations that live in entry functions only. 
+ bool NotClobbered = isEntryFunc && !isClobberedInFunction(&I); Instruction *PtrI = dyn_cast<Instruction>(Ptr); if (!PtrI && NotClobbered && isGlobalLoad(I)) { if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) { @@ -170,7 +170,7 @@ DA = &getAnalysis<LegacyDivergenceAnalysis>(); MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL; + isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv()); visit(F); noClobberClones.clear(); Index: llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -481,7 +481,7 @@ ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog - %val = load float, float addrspace(1)* %ptr + %val = load volatile float, float addrspace(1)* %ptr ret float %val } @@ -508,7 +508,7 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4095 - %val = load float, float addrspace(1)* %gep + %val = load volatile float, float addrspace(1)* %gep ret float %val } @@ -541,7 +541,7 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967296 - %val = load float, float addrspace(1)* %gep + %val = load volatile float, float addrspace(1)* %gep ret float %val } @@ -574,7 +574,7 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967297 - %val = load float, float addrspace(1)* %gep + %val = load volatile float, float addrspace(1)* %gep ret float %val } @@ -601,7 +601,7 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4096 - %val = load float, float addrspace(1)* 
%gep + %val = load volatile float, float addrspace(1)* %gep ret float %val } @@ -626,7 +626,7 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4095 - %val = load float, float addrspace(1)* %gep + %val = load volatile float, float addrspace(1)* %gep ret float %val } @@ -651,7 +651,7 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967296 - %val = load float, float addrspace(1)* %gep + %val = load volatile float, float addrspace(1)* %gep ret float %val } @@ -676,7 +676,7 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967297 - %val = load float, float addrspace(1)* %gep + %val = load volatile float, float addrspace(1)* %gep ret float %val } @@ -701,7 +701,7 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4096 - %val = load float, float addrspace(1)* %gep + %val = load volatile float, float addrspace(1)* %gep ret float %val } @@ -734,7 +734,7 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i32 %soffset - %val = load float, float addrspace(1)* %gep + %val = load volatile float, float addrspace(1)* %gep ret float %val } @@ -759,7 +759,7 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i32 %soffset - %val = load float, float addrspace(1)* %gep + %val = load volatile float, float addrspace(1)* %gep ret float %val } @@ -785,7 +785,7 @@ ; GFX7-NEXT: ; return to shader part epilog %gep0 = getelementptr float, float addrspace(1)* %ptr, i32 %soffset %gep1 = getelementptr float, float addrspace(1)* %gep0, i32 256 - %val = load float, float addrspace(1)* 
%gep1 + %val = load volatile float, float addrspace(1)* %gep1 ret float %val } @@ -823,7 +823,7 @@ ; GFX7-NEXT: ; return to shader part epilog %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 256 %gep1 = getelementptr float, float addrspace(1)* %gep0, i32 %soffset - %val = load float, float addrspace(1)* %gep1 + %val = load volatile float, float addrspace(1)* %gep1 ret float %val } @@ -852,7 +852,7 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i32 %voffset - %val = load float, float addrspace(1)* %gep + %val = load volatile float, float addrspace(1)* %gep ret float %val } @@ -884,7 +884,7 @@ ; GFX7-NEXT: ; return to shader part epilog %gep0 = getelementptr float, float addrspace(1)* %ptr, i32 %voffset %gep1 = getelementptr float, float addrspace(1)* %gep0, i64 4095 - %val = load float, float addrspace(1)* %gep1 + %val = load volatile float, float addrspace(1)* %gep1 ret float %val } define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095_vgpr_offset(float addrspace(1)* inreg %ptr, i32 %voffset) { @@ -913,7 +913,7 @@ ; GFX7-NEXT: ; return to shader part epilog %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 4095 %gep1 = getelementptr float, float addrspace(1)* %gep0, i32 %voffset - %val = load float, float addrspace(1)* %gep1 + %val = load volatile float, float addrspace(1)* %gep1 ret float %val } Index: llvm/test/CodeGen/AMDGPU/global-saddr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-saddr.ll +++ llvm/test/CodeGen/AMDGPU/global-saddr.ll @@ -84,6 +84,7 @@ ret void } +; GFX9-LABEL: {{^}}_amdgpu_cs_main: ; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}} ; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:16{{$}} ; GFX9-NEXT: s_waitcnt @@ -92,7 +93,7 @@ define amdgpu_cs void @_amdgpu_cs_main(i64 inreg %arg) { bb: %tmp1 = inttoptr i64 
%arg to <4 x i64> addrspace(1)* - %tmp2 = load <4 x i64>, <4 x i64> addrspace(1)* %tmp1, align 16 + %tmp2 = load volatile <4 x i64>, <4 x i64> addrspace(1)* %tmp1, align 16 store volatile <4 x i64> %tmp2, <4 x i64> addrspace(1)* undef ret void } Index: llvm/test/CodeGen/AMDGPU/infer-uniform-load-shader.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/infer-uniform-load-shader.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s +; +; Make sure shaders with uniform, unmodified global address space +; loads are accessed with scalar loads. + +define amdgpu_ps i32 @ps_load_uniform_global_i32_align4(i32 addrspace(1)* inreg %ptr) { +; GCN-LABEL: ps_load_uniform_global_i32_align4: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s0, s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %load = load i32, i32 addrspace(1)* %ptr, align 4 + ret i32 %load +} + +define amdgpu_cs i32 @cs_load_uniform_global_i32_align4(i32 addrspace(1)* inreg %ptr) { +; GCN-LABEL: cs_load_uniform_global_i32_align4: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s0, s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %load = load i32, i32 addrspace(1)* %ptr, align 4 + ret i32 %load +}