Index: llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -86,6 +86,7 @@
   // FIXME: This should be per-kernel.
   uint32_t LocalMemLimit = 0;
   uint32_t CurrentLocalMemUsage = 0;
+  unsigned MaxVGPRs;
 
   bool IsAMDGCN = false;
   bool IsAMDHSA = false;
@@ -129,6 +130,9 @@
 };
 
 class AMDGPUPromoteAllocaToVector : public FunctionPass {
+private:
+  unsigned MaxVGPRs;
+
 public:
   static char ID;
 
@@ -186,6 +190,13 @@
   if (!ST.isPromoteAllocaEnabled())
     return false;
 
+  if (IsAMDGCN) {
+    const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+    MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+  } else {
+    MaxVGPRs = 128;
+  }
+
   bool SufficientLDS = hasSufficientLocalMem(F);
   bool Changed = false;
   BasicBlock &EntryBB = *F.begin();
@@ -409,7 +420,8 @@
   }
 }
 
-static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
+                                     unsigned MaxVGPRs) {
 
   if (DisablePromoteAllocaToVector) {
     LLVM_DEBUG(dbgs() << "  Promotion alloca to vector is disabled\n");
@@ -424,6 +436,13 @@
     VectorTy = arrayTypeToVecType(ArrayTy);
   }
 
+  // Use up to 1/4 of available register budget for vectorization.
+  if (DL.getTypeSizeInBits(AllocaTy) * 4 > MaxVGPRs * 32) {
+    LLVM_DEBUG(dbgs() << "  Alloca too big for vectorization with "
+                      << MaxVGPRs << " registers available\n");
+    return false;
+  }
+
   LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
 
   // FIXME: There is no reason why we can't support larger arrays, we
@@ -806,7 +825,7 @@
 
   LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
 
-  if (tryPromoteAllocaToVector(&I, DL))
+  if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs))
     return true; // Promoted to vector.
 
   if (DisablePromoteAllocaToLDS)
@@ -1016,6 +1035,23 @@
   if (skipFunction(F) || DisablePromoteAllocaToVector)
     return false;
 
+  const TargetMachine *TM;
+  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+    TM = &TPC->getTM<TargetMachine>();
+  else
+    return false;
+
+  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
+  if (!ST.isPromoteAllocaEnabled())
+    return false;
+
+  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
+    const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+    MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+  } else {
+    MaxVGPRs = 128;
+  }
+
   bool Changed = false;
   BasicBlock &EntryBB = *F.begin();
 
@@ -1042,7 +1078,7 @@
   LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
 
   Module *Mod = I.getParent()->getParent()->getParent();
-  return tryPromoteAllocaToVector(&I, Mod->getDataLayout());
+  return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
 }
 
 FunctionPass *llvm::createAMDGPUPromoteAlloca() {
Index: llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
@@ -0,0 +1,118 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s
+target datalayout = "A5"
+
+; OPT-LABEL: @alloca_8xi64_max1024(
+; OPT-NOT: alloca
+; OPT: <8 x i64>
+define amdgpu_kernel void @alloca_8xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 {
+entry:
+  %tmp = alloca [8 x i64], addrspace(5)
+  %x = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_9xi64_max1024(
+; OPT: alloca [9 x i64]
+; OPT-NOT: <9 x i64>
+define amdgpu_kernel void @alloca_9xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 {
+entry:
+  %tmp = alloca [9 x i64], addrspace(5)
+  %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_16xi64_max512(
+; OPT-NOT: alloca
+; OPT: <16 x i64>
+define amdgpu_kernel void @alloca_16xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 {
+entry:
+  %tmp = alloca [16 x i64], addrspace(5)
+  %x = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_17xi64_max512(
+; OPT: alloca [17 x i64]
+; OPT-NOT: <17 x i64>
+define amdgpu_kernel void @alloca_17xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 {
+entry:
+  %tmp = alloca [17 x i64], addrspace(5)
+  %x = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 0
+  store i64 0, i64 addrspace(5)* %x
+  %tmp1 = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i64, i64 addrspace(5)* %tmp1
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_9xi128_max512(
+; OPT: alloca [9 x i128]
+; OPT-NOT: <9 x i128>
+define amdgpu_kernel void @alloca_9xi128_max512(i128 addrspace(1)* %out, i32 %index) #1 {
+entry:
+  %tmp = alloca [9 x i128], addrspace(5)
+  %x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0
+  store i128 0, i128 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i128, i128 addrspace(5)* %tmp1
+  store i128 %tmp2, i128 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_9xi128_max256(
+; OPT-NOT: alloca
+; OPT: <9 x i128>
+define amdgpu_kernel void @alloca_9xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 {
+entry:
+  %tmp = alloca [9 x i128], addrspace(5)
+  %x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0
+  store i128 0, i128 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i128, i128 addrspace(5)* %tmp1
+  store i128 %tmp2, i128 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_16xi128_max256(
+; OPT-NOT: alloca
+; OPT: <16 x i128>
+define amdgpu_kernel void @alloca_16xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 {
+entry:
+  %tmp = alloca [16 x i128], addrspace(5)
+  %x = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 0
+  store i128 0, i128 addrspace(5)* %x
+  %tmp1 = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i128, i128 addrspace(5)* %tmp1
+  store i128 %tmp2, i128 addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @alloca_9xi256_max256(
+; OPT: alloca [9 x i256]
+; OPT-NOT: <9 x i256>
+define amdgpu_kernel void @alloca_9xi256_max256(i256 addrspace(1)* %out, i32 %index) #2 {
+entry:
+  %tmp = alloca [9 x i256], addrspace(5)
+  %x = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 0
+  store i256 0, i256 addrspace(5)* %x
+  %tmp1 = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 %index
+  %tmp2 = load i256, i256 addrspace(5)* %tmp1
+  store i256 %tmp2, i256 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,1024" }
+attributes #1 = { "amdgpu-flat-work-group-size"="1,512" }
+attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }