# Patch summary (free-form preamble; everything before the first "diff --git"
# header is descriptive text, not patch payload).
#
# Files touched:
#   llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
#   llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
#
# AMDGPUPromoteAlloca.cpp:
#   * Adds an include of "Utils/AMDGPUBaseInfo.h" (presumably the header that
#     declares AMDGPU::isEntryFunctionCC, which the new code calls -- confirm).
#   * In two separate places (hunks at old lines 176 and 1107), right after
#     MaxVGPRs is set from ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first) on the
#     amdgcn path, adds a guard: when F's calling convention is not an entry
#     function calling convention, MaxVGPRs is clamped to at most 32 via
#     std::min(MaxVGPRs, 32u). The in-code comment gives the rationale: a
#     non-entry function has only 32 caller preserved registers, so promoting
#     a larger alloca would force spilling.
#   * The non-amdgcn path (MaxVGPRs = 128) is context only and is not changed.
#
# vector-alloca-limits.ll:
#   * Adds @alloca_9xi64_max256, an amdgpu_kernel using attribute group #2
#     ("amdgpu-flat-work-group-size"="1,256"). Under the OPT prefix it expects
#     no alloca and a <9 x i64> value; under LIMIT32 it expects the alloca to
#     remain and no <9 x i64>.
#   * Adds @func_alloca_9xi64_max256, the same body as a plain (non-kernel)
#     function with the same attribute group. Under both OPT and LIMIT32 it
#     expects the alloca to remain and no <9 x i64>.
#   * The RUN lines that define the OPT and LIMIT32 prefixes are outside this
#     patch's context -- check the test file for the options they use.
#
# NOTE(review): in this copy the diff is collapsed onto two physical lines
# (line breaks inside the hunks appear to have been lost, and the template
# argument of TM.getSubtarget looks stripped). Restore the original line
# structure from the upstream change before trying to apply it.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/IntrinsicsR600.h" #include "llvm/Pass.h" #include "llvm/Target/TargetMachine.h" +#include "Utils/AMDGPUBaseInfo.h" #define DEBUG_TYPE "amdgpu-promote-alloca" @@ -176,6 +177,10 @@ if (IsAMDGCN) { const GCNSubtarget &ST = TM.getSubtarget(F); MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first); + // A non-entry function has only 32 caller preserved registers. + // Do not promote alloca which will force spilling. + if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) + MaxVGPRs = std::min(MaxVGPRs, 32u); } else { MaxVGPRs = 128; } @@ -1107,6 +1112,10 @@ if (TM.getTargetTriple().getArch() == Triple::amdgcn) { const GCNSubtarget &ST = TM.getSubtarget(F); MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first); + // A non-entry function has only 32 caller preserved registers. + // Do not promote alloca which will force spilling. 
+ if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) + MaxVGPRs = std::min(MaxVGPRs, 32u); } else { MaxVGPRs = 128; } diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll --- a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll @@ -131,6 +131,38 @@ ret void } +; OPT-LABEL: @alloca_9xi64_max256( +; OPT-NOT: alloca +; OPT: <9 x i64> +; LIMIT32: alloca +; LIMIT32-NOT: <9 x i64> +define amdgpu_kernel void @alloca_9xi64_max256(i64 addrspace(1)* %out, i32 %index) #2 { +entry: + %tmp = alloca [9 x i64], addrspace(5) + %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0 + store i64 0, i64 addrspace(5)* %x + %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index + %tmp2 = load i64, i64 addrspace(5)* %tmp1 + store i64 %tmp2, i64 addrspace(1)* %out + ret void +} + +; OPT-LABEL: @func_alloca_9xi64_max256( +; OPT: alloca +; OPT-NOT: <9 x i64> +; LIMIT32: alloca +; LIMIT32-NOT: <9 x i64> +define void @func_alloca_9xi64_max256(i64 addrspace(1)* %out, i32 %index) #2 { +entry: + %tmp = alloca [9 x i64], addrspace(5) + %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0 + store i64 0, i64 addrspace(5)* %x + %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index + %tmp2 = load i64, i64 addrspace(5)* %tmp1 + store i64 %tmp2, i64 addrspace(1)* %out + ret void +} + attributes #0 = { "amdgpu-flat-work-group-size"="1,1024" } attributes #1 = { "amdgpu-flat-work-group-size"="1,512" } attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }