diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -195,6 +195,10 @@
 void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
 extern char &AMDGPUPromoteAllocaID;
 
+FunctionPass *createAMDGPUPromoteAllocaToVector();
+void initializeAMDGPUPromoteAllocaToVectorPass(PassRegistry&);
+extern char &AMDGPUPromoteAllocaToVectorID;
+
 Pass *createAMDGPUStructurizeCFGPass();
 FunctionPass *createAMDGPUISelDag(
   TargetMachine *TM = nullptr,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -128,14 +128,39 @@
   }
 };
 
+class AMDGPUPromoteAllocaToVector : public FunctionPass {
+public:
+  static char ID;
+
+  AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  StringRef getPassName() const override {
+    return "AMDGPU Promote Alloca to vector";
+  }
+
+  bool handleAlloca(AllocaInst &I);
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+};
+
 } // end anonymous namespace
 
 char AMDGPUPromoteAlloca::ID = 0;
+char AMDGPUPromoteAllocaToVector::ID = 0;
 
 INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
                 "AMDGPU promote alloca to vector or LDS", false, false)
 
+INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
+                "AMDGPU promote alloca to vector", false, false)
+
 char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
+char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
 
 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
   Mod = &M;
@@ -982,6 +1007,43 @@
   return true;
 }
 
+bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
+  if (skipFunction(F) || DisablePromoteAllocaToVector)
+    return false;
+
+  bool Changed = false;
+  BasicBlock &EntryBB = *F.begin();
+
+  SmallVector<AllocaInst *, 16> Allocas;
+  for (Instruction &I : EntryBB) {
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+      Allocas.push_back(AI);
+  }
+
+  for (AllocaInst *AI : Allocas) {
+    if (handleAlloca(*AI))
+      Changed = true;
+  }
+
+  return Changed;
+}
+
+bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) {
+  // Array allocations are probably not worth handling, since an allocation of
+  // the array type is the canonical form.
+  if (!I.isStaticAlloca() || I.isArrayAllocation())
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
+
+  Module *Mod = I.getParent()->getParent()->getParent();
+  return tryPromoteAllocaToVector(&I, Mod->getDataLayout());
+}
+
 FunctionPass *llvm::createAMDGPUPromoteAlloca() {
   return new AMDGPUPromoteAlloca();
 }
+
+FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
+  return new AMDGPUPromoteAllocaToVector();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -235,6 +235,7 @@
   initializeAMDGPUPostLegalizerCombinerPass(*PR);
   initializeAMDGPUPreLegalizerCombinerPass(*PR);
   initializeAMDGPUPromoteAllocaPass(*PR);
+  initializeAMDGPUPromoteAllocaToVectorPass(*PR);
   initializeAMDGPUCodeGenPreparePass(*PR);
   initializeAMDGPUPropagateAttributesEarlyPass(*PR);
   initializeAMDGPUPropagateAttributesLatePass(*PR);
@@ -470,7 +471,7 @@
 
   Builder.addExtension(
     PassManagerBuilder::EP_CGSCCOptimizerLate,
-    [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+    [EnableOpt](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
       // Add infer address spaces pass to the opt pipeline after inlining
       // but before SROA to increase SROA opportunities.
       PM.add(createInferAddressSpacesPass());
@@ -478,6 +479,11 @@
       // This should run after inlining to have any chance of doing anything,
       // and before other cleanup optimizations.
       PM.add(createAMDGPULowerKernelAttributesPass());
+
+      // Promote alloca to vector before SROA and loop unroll. If we manage
+      // to eliminate allocas before unroll, we may choose to unroll less.
+      if (EnableOpt)
+        PM.add(createAMDGPUPromoteAllocaToVector());
     });
 }
diff --git a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
@@ -0,0 +1,47 @@
+; RUN: opt -mtriple=amdgcn-- -O1 -S < %s | FileCheck %s --check-prefixes=FUNC,LOOP
+; RUN: opt -mtriple=amdgcn-- -O1 -S -disable-promote-alloca-to-vector < %s | FileCheck %s --check-prefixes=FUNC,FULL-UNROLL
+
+target datalayout = "A5"
+
+; This test contains a simple loop that initializes an array declared in
+; private memory. This loop would be fully unrolled if we could not SROA
+; the alloca. Check that the alloca is eliminated before unrolling, so
+; that the loop does not need to be fully unrolled.
+
+; FUNC-LABEL: @private_memory
+; LOOP-NOT: alloca
+; LOOP: loop.header:
+; LOOP: br i1 %{{[^,]+}}, label %exit, label %loop.header
+
+; FULL-UNROLL: alloca
+; FULL-UNROLL-COUNT-256: store i32 {{[0-9]+}}, i32 addrspace(5)*
+; FULL-UNROLL-NOT: br
+
+; FUNC: store i32 %{{[^,]+}}, i32 addrspace(1)* %out
+define amdgpu_kernel void @private_memory(i32 addrspace(1)* %out, i32 %n) {
+entry:
+  %alloca = alloca [16 x i32], addrspace(5)
+  br label %loop.header
+
+loop.header:
+  %counter = phi i32 [0, %entry], [%inc, %loop.inc]
+  br label %loop.body
+
+loop.body:
+  %salt = xor i32 %counter, %n
+  %idx = and i32 %salt, 15
+  %ptr = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %idx
+  store i32 %counter, i32 addrspace(5)* %ptr
+  br label %loop.inc
+
+loop.inc:
+  %inc = add i32 %counter, 1
+  %cmp = icmp sge i32 %counter, 255
+  br i1 %cmp, label %exit, label %loop.header
+
+exit:
+  %gep = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %n
+  %load = load i32, i32 addrspace(5)* %gep
+  store i32 %load, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
@@ -101,6 +101,7 @@
 ; GCN-O1-NEXT: Infer address spaces
 ; GCN-O1-NEXT: AMDGPU Kernel Attributes
 ; GCN-O1-NEXT: FunctionPass Manager
+; GCN-O1-NEXT: AMDGPU Promote Alloca to vector
 ; GCN-O1-NEXT: Dominator Tree Construction
 ; GCN-O1-NEXT: SROA
 ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl)
@@ -401,6 +402,7 @@
 ; GCN-O2-NEXT: Infer address spaces
 ; GCN-O2-NEXT: AMDGPU Kernel Attributes
 ; GCN-O2-NEXT: FunctionPass Manager
+; GCN-O2-NEXT: AMDGPU Promote Alloca to vector
 ; GCN-O2-NEXT: Dominator Tree Construction
 ; GCN-O2-NEXT: SROA
 ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl)
@@ -752,6 +754,7 @@
 ; GCN-O3-NEXT: Infer address spaces
 ; GCN-O3-NEXT: AMDGPU Kernel Attributes
 ; GCN-O3-NEXT: FunctionPass Manager
+; GCN-O3-NEXT: AMDGPU Promote Alloca to vector
 ; GCN-O3-NEXT: Dominator Tree Construction
 ; GCN-O3-NEXT: SROA
 ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)