Index: llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -110,7 +110,9 @@
         BasicBlock::iterator(Load) : BB->end();
     auto Q = MDR->getPointerDependencyFrom(
         MemoryLocation::getBeforeOrAfter(Ptr), true, StartIt, BB, Load);
-    if (Q.isClobber() || Q.isUnknown())
+    if (Q.isClobber() || Q.isUnknown() ||
+        // Store defines the load and thus clobbers it.
+        (Q.isDef() && Q.getInst()->mayWriteToMemory()))
       return true;
   }
   return false;
Index: llvm/test/CodeGen/AMDGPU/store-clobbers-load.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/store-clobbers-load.ll
@@ -0,0 +1,23 @@
+; RUN: opt -S --amdgpu-annotate-uniform < %s | FileCheck -check-prefix=OPT %s
+target datalayout = "A5"
+
+; "load vaddr" depends on the store, so we should not mark vaddr as amdgpu.noclobber.
+
+; OPT-LABEL: @store_clobbers_load(
+; OPT: %vaddr = bitcast [4 x i32] addrspace(5)* %alloca to <4 x i32> addrspace(5)*, !amdgpu.uniform !0
+; OPT-NEXT: %zero = load <4 x i32>, <4 x i32> addrspace(5)* %vaddr, align 16
+define amdgpu_kernel void @store_clobbers_load(i32 addrspace(1)* %out, i32 %index) {
+entry:
+  %alloca = alloca [4 x i32], addrspace(5)
+  %addr0 = bitcast [4 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
+  store i32 0, i32 addrspace(5)* %addr0
+  %vaddr = bitcast [4 x i32] addrspace(5)* %alloca to <4 x i32> addrspace(5)*
+  %zero = load <4 x i32>, <4 x i32> addrspace(5)* %vaddr, align 16
+  %one = insertelement <4 x i32> %zero, i32 1, i32 1
+  %two = insertelement <4 x i32> %one, i32 2, i32 2
+  %three = insertelement <4 x i32> %two, i32 3, i32 3
+  store <4 x i32> %three, <4 x i32> addrspace(5)* %vaddr, align 16
+  %rslt = extractelement <4 x i32> %three, i32 %index
+  store i32 %rslt, i32 addrspace(1)* %out, align 4
+  ret void
+}
Index: llvm/test/CodeGen/AMDGPU/wave32.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/wave32.ll
+++ llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -231,9 +231,9 @@
 ; GCN: ; %bb.{{[0-9]+}}: ; %.preheader
 ; GCN: BB{{.*}}:
+; GCN: global_store_dword
 ; GFX1032: s_or_b32 [[MASK0:s[0-9]+]], [[MASK0]], vcc_lo
 ; GFX1064: s_or_b64 [[MASK0:s\[[0-9:]+\]]], [[MASK0]], vcc
-; GCN: global_store_dword
 ; GFX1032: s_andn2_b32 [[MASK1:s[0-9]+]], [[MASK1]], exec_lo
 ; GFX1064: s_andn2_b64 [[MASK1:s\[[0-9:]+\]]], [[MASK1]], exec
 ; GFX1032: s_and_b32 [[MASK0]], [[MASK0]], exec_lo
@@ -249,10 +249,12 @@
 ; GFX1064: s_andn2_b64 exec, exec, [[ACC]]
 ; GCN: s_cbranch_execz
 ; GCN: BB{{.*}}:
-; GCN: s_load_dword [[LOAD:s[0-9]+]]
+
 ; GFX1032: s_or_b32 [[MASK1]], [[MASK1]], exec_lo
 ; GFX1064: s_or_b64 [[MASK1]], [[MASK1]], exec
-; GCN: s_cmp_lt_i32 [[LOAD]], 11
+; GCN: global_load_dword [[LOAD:v[0-9]+]]
+; GFX1032: v_cmp_gt_i32_e32 vcc_lo, 11, [[LOAD]]
+; GFX1064: v_cmp_gt_i32_e32 vcc, 11, [[LOAD]]
 define amdgpu_kernel void @test_loop_with_if_else_break(i32 addrspace(1)* %arg) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()