Index: llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -142,10 +142,11 @@
   }
 
   bool NotClobbered = false;
+  bool GlobalLoad = isGlobalLoad(I);
   if (PtrI)
-    NotClobbered = !isClobberedInFunction(&I);
+    NotClobbered = GlobalLoad && !isClobberedInFunction(&I);
   else if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) {
-    if (isGlobalLoad(I) && !isClobberedInFunction(&I)) {
+    if (GlobalLoad && !isClobberedInFunction(&I)) {
       NotClobbered = true;
       // Lookup for the existing GEP
       if (noClobberClones.count(Ptr)) {
Index: llvm/test/CodeGen/AMDGPU/annotate-noclobber.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/annotate-noclobber.ll
@@ -0,0 +1,47 @@
+; RUN: opt -S --amdgpu-annotate-uniform < %s | FileCheck -check-prefix=OPT %s
+target datalayout = "A5"
+
+
+; OPT-LABEL: @amdgpu_noclobber_global(
+; OPT: %addr = getelementptr i32, i32 addrspace(1)* %in, i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; OPT-NEXT: %load = load i32, i32 addrspace(1)* %addr, align 4
+define amdgpu_kernel void @amdgpu_noclobber_global( i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  %addr = getelementptr i32, i32 addrspace(1)* %in, i64 0
+  %load = load i32, i32 addrspace(1)* %addr, align 4
+  store i32 %load, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; OPT-LABEL: @amdgpu_noclobber_local(
+; OPT: %addr = getelementptr i32, i32 addrspace(3)* %in, i64 0, !amdgpu.uniform !0
+; OPT-NEXT: %load = load i32, i32 addrspace(3)* %addr, align 4
+define amdgpu_kernel void @amdgpu_noclobber_local( i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
+entry:
+  %addr = getelementptr i32, i32 addrspace(3)* %in, i64 0
+  %load = load i32, i32 addrspace(3)* %addr, align 4
+  store i32 %load, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; OPT-LABEL: @amdgpu_noclobber_private(
+; OPT: %addr = getelementptr i32, i32 addrspace(5)* %in, i64 0, !amdgpu.uniform !0
+; OPT-NEXT: %load = load i32, i32 addrspace(5)* %addr, align 4
+define amdgpu_kernel void @amdgpu_noclobber_private( i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
+entry:
+  %addr = getelementptr i32, i32 addrspace(5)* %in, i64 0
+  %load = load i32, i32 addrspace(5)* %addr, align 4
+  store i32 %load, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; OPT-LABEL: @amdgpu_noclobber_flat(
+; OPT: %addr = getelementptr i32, i32 addrspace(4)* %in, i64 0, !amdgpu.uniform !0
+; OPT-NEXT: %load = load i32, i32 addrspace(4)* %addr, align 4
+define amdgpu_kernel void @amdgpu_noclobber_flat( i32 addrspace(4)* %in, i32 addrspace(1)* %out) {
+entry:
+  %addr = getelementptr i32, i32 addrspace(4)* %in, i64 0
+  %load = load i32, i32 addrspace(4)* %addr, align 4
+  store i32 %load, i32 addrspace(1)* %out, align 4
+  ret void
+}
Index: llvm/test/CodeGen/AMDGPU/store-clobbers-load.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/store-clobbers-load.ll
+++ llvm/test/CodeGen/AMDGPU/store-clobbers-load.ll
@@ -4,24 +4,24 @@
 
 ; "load vaddr" depends on the store, so we should not mark vaddr as amdgpu.noclobber.
 
 ; OPT-LABEL: @store_clobbers_load(
-; OPT: %vaddr = bitcast [4 x i32] addrspace(5)* %alloca to <4 x i32> addrspace(5)*, !amdgpu.uniform !0
-; OPT-NEXT: %zero = load <4 x i32>, <4 x i32> addrspace(5)* %vaddr, align 16
-define amdgpu_kernel void @store_clobbers_load(i32 addrspace(1)* %out, i32 %index) {
+; OPT: %vaddr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %input, i64 0, !amdgpu.uniform !0
+; OPT-NEXT: %zero = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr, align 16
+define amdgpu_kernel void @store_clobbers_load( < 4 x i32> addrspace(1)* %input, i32 addrspace(1)* %out, i32 %index) {
 entry:
-  %alloca = alloca [4 x i32], addrspace(5)
-  %addr0 = bitcast [4 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
-  store i32 0, i32 addrspace(5)* %addr0
-  %vaddr = bitcast [4 x i32] addrspace(5)* %alloca to <4 x i32> addrspace(5)*
-  %zero = load <4 x i32>, <4 x i32> addrspace(5)* %vaddr, align 16
+  %addr0 = bitcast <4 x i32> addrspace(1)* %input to i32 addrspace(1)*
+  store i32 0, i32 addrspace(1)* %addr0
+  %vaddr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %input, i64 0
+  %zero = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr, align 16
   %one = insertelement <4 x i32> %zero, i32 1, i32 1
   %two = insertelement <4 x i32> %one, i32 2, i32 2
   %three = insertelement <4 x i32> %two, i32 3, i32 3
-  store <4 x i32> %three, <4 x i32> addrspace(5)* %vaddr, align 16
+  store <4 x i32> %three, <4 x i32> addrspace(1)* %input, align 16
   %rslt = extractelement <4 x i32> %three, i32 %index
   store i32 %rslt, i32 addrspace(1)* %out, align 4
   ret void
 }
+
 declare i32 @llvm.amdgcn.workitem.id.x()
 
 @lds0 = addrspace(3) global [512 x i32] undef, align 4