diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -76,6 +76,11 @@ cl::desc("Disable promote alloca to LDS"), cl::init(false)); +static cl::opt<unsigned> PromoteAllocaToVectorLimit( + "amdgpu-promote-alloca-to-vector-limit", + cl::desc("Maximum byte size to consider promote alloca to vector"), + cl::init(0)); + // FIXME: This can create globals so should be a module pass. class AMDGPUPromoteAlloca : public FunctionPass { private: @@ -86,6 +91,7 @@ // FIXME: This should be per-kernel. uint32_t LocalMemLimit = 0; uint32_t CurrentLocalMemUsage = 0; + unsigned MaxVGPRs; bool IsAMDGCN = false; bool IsAMDHSA = false; @@ -129,6 +135,9 @@ }; class AMDGPUPromoteAllocaToVector : public FunctionPass { +private: + unsigned MaxVGPRs; + public: static char ID; @@ -186,6 +195,13 @@ if (!ST.isPromoteAllocaEnabled()) return false; + if (IsAMDGCN) { + const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F); + MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first); + } else { + MaxVGPRs = 128; + } + bool SufficientLDS = hasSufficientLocalMem(F); bool Changed = false; BasicBlock &EntryBB = *F.begin(); @@ -409,7 +425,8 @@ } } -static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) { +static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL, + unsigned MaxVGPRs) { if (DisablePromoteAllocaToVector) { LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n"); @@ -424,6 +441,16 @@ VectorTy = arrayTypeToVecType(ArrayTy); } + // Use up to 1/4 of available register budget for vectorization. + unsigned Limit = PromoteAllocaToVectorLimit ? 
PromoteAllocaToVectorLimit * 8 + : (MaxVGPRs * 32); + + if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) { + LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with " + << MaxVGPRs << " registers available\n"); + return false; + } + LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n"); // FIXME: There is no reason why we can't support larger arrays, we @@ -806,7 +833,7 @@ LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n'); - if (tryPromoteAllocaToVector(&I, DL)) + if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs)) return true; // Promoted to vector. if (DisablePromoteAllocaToLDS) @@ -1016,6 +1043,23 @@ if (skipFunction(F) || DisablePromoteAllocaToVector) return false; + const TargetMachine *TM; + if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) + TM = &TPC->getTM<TargetMachine>(); + else + return false; + + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F); + if (!ST.isPromoteAllocaEnabled()) + return false; + + if (TM->getTargetTriple().getArch() == Triple::amdgcn) { + const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F); + MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first); + } else { + MaxVGPRs = 128; + } + bool Changed = false; BasicBlock &EntryBB = *F.begin(); @@ -1042,7 +1086,7 @@ LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n'); Module *Mod = I.getParent()->getParent()->getParent(); - return tryPromoteAllocaToVector(&I, Mod->getDataLayout()); + return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs); } FunctionPass *llvm::createAMDGPUPromoteAlloca() { diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll @@ -0,0 +1,136 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine -amdgpu-promote-alloca-to-vector-limit=32 < %s | FileCheck -check-prefix=LIMIT32 %s + 
+target datalayout = "A5" + +; OPT-LABEL: @alloca_8xi64_max1024( +; OPT-NOT: alloca +; OPT: <8 x i64> +; LIMIT32: alloca +; LIMIT32-NOT: <8 x i64> +define amdgpu_kernel void @alloca_8xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 { +entry: + %tmp = alloca [8 x i64], addrspace(5) + %x = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 0 + store i64 0, i64 addrspace(5)* %x + %tmp1 = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 %index + %tmp2 = load i64, i64 addrspace(5)* %tmp1 + store i64 %tmp2, i64 addrspace(1)* %out + ret void +} + +; OPT-LABEL: @alloca_9xi64_max1024( +; OPT: alloca [9 x i64] +; OPT-NOT: <9 x i64> +; LIMIT32: alloca +; LIMIT32-NOT: <9 x i64> +define amdgpu_kernel void @alloca_9xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 { +entry: + %tmp = alloca [9 x i64], addrspace(5) + %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0 + store i64 0, i64 addrspace(5)* %x + %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index + %tmp2 = load i64, i64 addrspace(5)* %tmp1 + store i64 %tmp2, i64 addrspace(1)* %out + ret void +} + +; OPT-LABEL: @alloca_16xi64_max512( +; OPT-NOT: alloca +; OPT: <16 x i64> +; LIMIT32: alloca +; LIMIT32-NOT: <16 x i64> +define amdgpu_kernel void @alloca_16xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 { +entry: + %tmp = alloca [16 x i64], addrspace(5) + %x = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 0 + store i64 0, i64 addrspace(5)* %x + %tmp1 = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 %index + %tmp2 = load i64, i64 addrspace(5)* %tmp1 + store i64 %tmp2, i64 addrspace(1)* %out + ret void +} + +; OPT-LABEL: @alloca_17xi64_max512( +; OPT: alloca [17 x i64] +; OPT-NOT: <17 x i64> +; LIMIT32: alloca +; LIMIT32-NOT: <17 x i64> +define amdgpu_kernel void @alloca_17xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 { +entry: + %tmp = alloca [17 x i64], addrspace(5) + %x = getelementptr [17 
x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 0 + store i64 0, i64 addrspace(5)* %x + %tmp1 = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 %index + %tmp2 = load i64, i64 addrspace(5)* %tmp1 + store i64 %tmp2, i64 addrspace(1)* %out + ret void +} + +; OPT-LABEL: @alloca_9xi128_max512( +; OPT: alloca [9 x i128] +; OPT-NOT: <9 x i128> +; LIMIT32: alloca +; LIMIT32-NOT: <9 x i128> +define amdgpu_kernel void @alloca_9xi128_max512(i128 addrspace(1)* %out, i32 %index) #1 { +entry: + %tmp = alloca [9 x i128], addrspace(5) + %x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0 + store i128 0, i128 addrspace(5)* %x + %tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index + %tmp2 = load i128, i128 addrspace(5)* %tmp1 + store i128 %tmp2, i128 addrspace(1)* %out + ret void +} + +; OPT-LABEL: @alloca_9xi128_max256( +; OPT-NOT: alloca +; OPT: <9 x i128> +; LIMIT32: alloca +; LIMIT32-NOT: <9 x i128> +define amdgpu_kernel void @alloca_9xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 { +entry: + %tmp = alloca [9 x i128], addrspace(5) + %x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0 + store i128 0, i128 addrspace(5)* %x + %tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index + %tmp2 = load i128, i128 addrspace(5)* %tmp1 + store i128 %tmp2, i128 addrspace(1)* %out + ret void +} + +; OPT-LABEL: @alloca_16xi128_max256( +; OPT-NOT: alloca +; OPT: <16 x i128> +; LIMIT32: alloca +; LIMIT32-NOT: <16 x i128> +define amdgpu_kernel void @alloca_16xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 { +entry: + %tmp = alloca [16 x i128], addrspace(5) + %x = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 0 + store i128 0, i128 addrspace(5)* %x + %tmp1 = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 %index + %tmp2 = load i128, i128 addrspace(5)* %tmp1 + store i128 %tmp2, i128 addrspace(1)* %out + ret void +} + +; 
OPT-LABEL: @alloca_9xi256_max256( +; OPT: alloca [9 x i256] +; OPT-NOT: <9 x i256> +; LIMIT32: alloca +; LIMIT32-NOT: <9 x i256> +define amdgpu_kernel void @alloca_9xi256_max256(i256 addrspace(1)* %out, i32 %index) #2 { +entry: + %tmp = alloca [9 x i256], addrspace(5) + %x = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 0 + store i256 0, i256 addrspace(5)* %x + %tmp1 = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 %index + %tmp2 = load i256, i256 addrspace(5)* %tmp1 + store i256 %tmp2, i256 addrspace(1)* %out + ret void +} + +attributes #0 = { "amdgpu-flat-work-group-size"="1,1024" } +attributes #1 = { "amdgpu-flat-work-group-size"="1,512" } +attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }