[AMDGPU] Do not promote allocas to LDS when external LDS is referenced

AMDGPUPromoteAlloca.cpp: for every global in UsedLDS, if the global has
external linkage, set LocalMemLimit to 0, emit a debug message and return
false, so that no alloca in the function is promoted to local memory.

promote-alloca-to-lds-constantexpr-use.ll: declare @some_dynamic_lds (an
external [0 x i32] in addrspace(3)) and add two kernels that reference it
through a one-level and a multi-level constant expression. Both checks
expect the alloca to remain in the IR and the fixed group segment size to
be 0.

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -781,6 +781,18 @@
     }
   }
 
+  // HIP uses an extern unsized array in local address space for dynamically
+  // allocated shared memory. In that case, we have to disable the promotion.
+  for (const GlobalVariable *GV : UsedLDS) {
+    if (GV->hasExternalLinkage()) {
+      LocalMemLimit = 0;
+      LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
+                           "local memory. Promoting to local memory "
+                           "disabled.\n");
+      return false;
+    }
+  }
+
   const DataLayout &DL = Mod->getDataLayout();
   SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
   AllocatedSizes.reserve(UsedLDS.size());
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
@@ -5,6 +5,7 @@
 
 @all_lds = internal unnamed_addr addrspace(3) global [16384 x i32] undef, align 4
 @some_lds = internal unnamed_addr addrspace(3) global [32 x i32] undef, align 4
+@some_dynamic_lds = external hidden addrspace(3) global [0 x i32], align 4
 
 @initializer_user_some = addrspace(1) global i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), align 4
 @initializer_user_all = addrspace(1) global i32 ptrtoint ([16384 x i32] addrspace(3)* @all_lds to i32), align 4
@@ -62,6 +63,33 @@
   ret void
 }
 
+; Has a constant expression use through a single level of constant
+; expression, but usage of dynamic LDS should block promotion
+
+; IR-LABEL: @constant_expression_uses_some_dynamic_lds(
+; IR: alloca
+
+; ASM-LABEL: {{^}}constant_expression_uses_some_dynamic_lds:
+; ASM: .amdhsa_group_segment_fixed_size 0{{$}}
+define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+entry:
+  %stack = alloca [4 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
+  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
+  store i32 9, i32 addrspace(5)* %gep0
+  store i32 10, i32 addrspace(5)* %gep1
+  store i32 99, i32 addrspace(5)* %gep2
+  store i32 43, i32 addrspace(5)* %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
+  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
+  store i32 %load, i32 addrspace(1)* %out
+  %gep_dyn_lds = getelementptr inbounds [0 x i32], [0 x i32]* addrspacecast ([0 x i32] addrspace(3)* @some_dynamic_lds to [0 x i32]*), i64 0, i64 0
+  store i32 1234, i32* %gep_dyn_lds, align 4
+  ret void
+}
+
 declare void @callee(i8*)
 
 ; IR-LABEL: @constant_expression_uses_all_lds_multi_level(
@@ -111,6 +139,29 @@
   ret void
 }
 
+; IR-LABEL: @constant_expression_uses_some_dynamic_lds_multi_level(
+; IR: alloca
+
+; ASM-LABEL: {{^}}constant_expression_uses_some_dynamic_lds_multi_level:
+; ASM: .amdhsa_group_segment_fixed_size 0{{$}}
+define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+entry:
+  %stack = alloca [4 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
+  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
+  store i32 9, i32 addrspace(5)* %gep0
+  store i32 10, i32 addrspace(5)* %gep1
+  store i32 99, i32 addrspace(5)* %gep2
+  store i32 43, i32 addrspace(5)* %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
+  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
+  store i32 %load, i32 addrspace(1)* %out
+  call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([0 x i32], [0 x i32] addrspace(3)* @some_dynamic_lds, i32 0, i32 0) to i8 addrspace(3)*) to i8*))
+  ret void
+}
+
 ; IR-LABEL: @constant_expression_uses_some_lds_global_initializer(
 ; IR-NOT: alloca
 ; IR: llvm.amdgcn.workitem.id