diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -208,6 +208,23 @@ void initializeAMDGPUPromoteAllocaToVectorPass(PassRegistry&); extern char &AMDGPUPromoteAllocaToVectorID; +struct AMDGPUPromoteAllocaPass : PassInfoMixin<AMDGPUPromoteAllocaPass> { + AMDGPUPromoteAllocaPass(TargetMachine &TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + +private: + TargetMachine &TM; +}; + +struct AMDGPUPromoteAllocaToVectorPass + : PassInfoMixin<AMDGPUPromoteAllocaToVectorPass> { + AMDGPUPromoteAllocaToVectorPass(TargetMachine &TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + +private: + TargetMachine &TM; +}; + Pass *createAMDGPUStructurizeCFGPass(); FunctionPass *createAMDGPUISelDag( TargetMachine *TM = nullptr, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -42,6 +42,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" @@ -83,8 +84,26 @@ // FIXME: This can create globals so should be a module pass. 
class AMDGPUPromoteAlloca : public FunctionPass { +public: + static char ID; + + AMDGPUPromoteAlloca() : FunctionPass(ID) {} + + bool runOnFunction(Function &F) override; + + StringRef getPassName() const override { return "AMDGPU Promote Alloca"; } + + bool handleAlloca(AllocaInst &I, bool SufficientLDS); + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + FunctionPass::getAnalysisUsage(AU); + } +}; + +class AMDGPUPromoteAllocaImpl { private: - const TargetMachine *TM; + const TargetMachine &TM; Module *Mod = nullptr; const DataLayout *DL = nullptr; @@ -116,28 +135,14 @@ /// Check whether we have enough local memory for promotion. bool hasSufficientLocalMem(const Function &F); -public: - static char ID; - - AMDGPUPromoteAlloca() : FunctionPass(ID) {} - - bool doInitialization(Module &M) override; - bool runOnFunction(Function &F) override; - - StringRef getPassName() const override { return "AMDGPU Promote Alloca"; } - bool handleAlloca(AllocaInst &I, bool SufficientLDS); - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - FunctionPass::getAnalysisUsage(AU); - } +public: + AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {} + bool run(Function &F); }; class AMDGPUPromoteAllocaToVector : public FunctionPass { -private: - unsigned MaxVGPRs; - public: static char ID; @@ -149,8 +154,6 @@ return "AMDGPU Promote Alloca to vector"; } - bool handleAlloca(AllocaInst &I); - void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); FunctionPass::getAnalysisUsage(AU); @@ -171,32 +174,41 @@ char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID; char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID; -bool AMDGPUPromoteAlloca::doInitialization(Module &M) { - Mod = &M; - DL = &Mod->getDataLayout(); +bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) { + return 
AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>()).run(F); + } return false; } -bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; +PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F, + FunctionAnalysisManager &AM) { + bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F); + if (Changed) { + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + return PA; + } + return PreservedAnalyses::all(); +} - if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) - TM = &TPC->getTM<TargetMachine>(); - else - return false; +bool AMDGPUPromoteAllocaImpl::run(Function &F) { + Mod = F.getParent(); + DL = &Mod->getDataLayout(); - const Triple &TT = TM->getTargetTriple(); + const Triple &TT = TM.getTargetTriple(); IsAMDGCN = TT.getArch() == Triple::amdgcn; IsAMDHSA = TT.getOS() == Triple::AMDHSA; - const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); if (!ST.isPromoteAllocaEnabled()) return false; if (IsAMDGCN) { - const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F); + const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first); } else { MaxVGPRs = 128; @@ -221,9 +233,9 @@ } std::pair<Value *, Value *> -AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { +AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) { const Function &F = *Builder.GetInsertBlock()->getParent(); - const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); if (!IsAMDHSA) { Function *LocalSizeYFn @@ -308,9 +320,10 @@ return std::make_pair(Y, LoadZU); } -Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) { +Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder, + unsigned N) { const AMDGPUSubtarget &ST = - AMDGPUSubtarget::get(*TM, *Builder.GetInsertBlock()->getParent()); + AMDGPUSubtarget::get(TM, *Builder.GetInsertBlock()->getParent()); Intrinsic::ID IntrID = Intrinsic::not_intrinsic; switch (N) { @@ -592,11 +605,9 @@ } } 
-bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca, - Value *Val, - Instruction *Inst, - int OpIdx0, - int OpIdx1) const { +bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca( + Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0, + int OpIdx1) const { // Figure out which operand is the one we might not be promoting. Value *OtherOp = Inst->getOperand(OpIdx0); if (Val == OtherOp) @@ -624,10 +635,8 @@ return true; } -bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes( - Value *BaseAlloca, - Value *Val, - std::vector<Value*> &WorkList) const { +bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( + Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const { for (User *User : Val->users()) { if (is_contained(WorkList, User)) @@ -727,10 +736,10 @@ return true; } -bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) { +bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { FunctionType *FTy = F.getFunctionType(); - const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); // If the function has any arguments in the local address space, then it's // possible these arguments require the entire local memory space, so @@ -863,7 +872,7 @@ } // FIXME: Should try to pick the most likely to be profitable allocas first. -bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { +bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { // Array allocations are probably not worth handling, since an allocation of // the array type is the canonical form. 
if (!I.isStaticAlloca() || I.isArrayAllocation()) @@ -904,7 +913,7 @@ if (!SufficientLDS) return false; - const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction); unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; Align Alignment = @@ -1083,22 +1092,29 @@ return true; } -bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) { - if (skipFunction(F) || DisablePromoteAllocaToVector) +bool handlePromoteAllocaToVector(AllocaInst &I, unsigned MaxVGPRs) { + // Array allocations are probably not worth handling, since an allocation of + // the array type is the canonical form. + if (!I.isStaticAlloca() || I.isArrayAllocation()) return false; - const TargetMachine *TM; - if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) - TM = &TPC->getTM<TargetMachine>(); - else + LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n'); + + Module *Mod = I.getParent()->getParent()->getParent(); + return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs); +} + +bool promoteAllocasToVector(Function &F, TargetMachine &TM) { + if (DisablePromoteAllocaToVector) return false; - const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); if (!ST.isPromoteAllocaEnabled()) return false; - if (TM->getTargetTriple().getArch() == Triple::amdgcn) { - const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F); + unsigned MaxVGPRs; + if (TM.getTargetTriple().getArch() == Triple::amdgcn) { + const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first); } else { MaxVGPRs = 128; @@ -1114,23 +1130,31 @@ } for (AllocaInst *AI : Allocas) { - if (handleAlloca(*AI)) + if (handlePromoteAllocaToVector(*AI, MaxVGPRs)) Changed = true; } return Changed; } -bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) { - // Array allocations are probably not worth handling, since an allocation of - // the array type is the 
canonical form. - if (!I.isStaticAlloca() || I.isArrayAllocation()) +bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) { + if (skipFunction(F)) return false; + if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) { + return promoteAllocasToVector(F, TPC->getTM<TargetMachine>()); + } + return false; +} - LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n'); - - Module *Mod = I.getParent()->getParent()->getParent(); - return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs); +PreservedAnalyses +AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) { + bool Changed = promoteAllocasToVector(F, TM); + if (Changed) { + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + return PA; + } + return PreservedAnalyses::all(); } FunctionPass *llvm::createAMDGPUPromoteAlloca() { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -29,6 +29,7 @@ #include "SIMachineFunctionInfo.h" #include "SIMachineScheduler.h" #include "TargetInfo/AMDGPUTargetInfo.h" +#include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" @@ -488,8 +489,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB, bool DebugPassManager) { PB.registerPipelineParsingCallback( - [](StringRef PassName, FunctionPassManager &PM, - ArrayRef<PassBuilder::PipelineElement>) { + [this](StringRef PassName, FunctionPassManager &PM, + ArrayRef<PassBuilder::PipelineElement>) { if (PassName == "amdgpu-simplifylib") { PM.addPass(AMDGPUSimplifyLibCallsPass()); return true; @@ -498,6 +499,14 @@ PM.addPass(AMDGPUUseNativeCallsPass()); return true; } + if (PassName == "amdgpu-promote-alloca") { + PM.addPass(AMDGPUPromoteAllocaPass(*this)); + return true; + } + if (PassName == "amdgpu-promote-alloca-to-vector") { + 
PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this)); + return true; + } return false; }); @@ -510,6 +519,18 @@ FPM.addPass(AMDGPUSimplifyLibCallsPass()); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); }); + + PB.registerCGSCCOptimizerLateEPCallback( + [this, DebugPassManager](CGSCCPassManager &PM, + PassBuilder::OptimizationLevel Level) { + if (Level != PassBuilder::OptimizationLevel::O0) { + FunctionPassManager FPM(DebugPassManager); + // Promote alloca to vector before SROA and loop unroll. If we manage + // to eliminate allocas before unroll we may choose to unroll less. + FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this)); + PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM))); + } + }); } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll --- a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll @@ -1,5 +1,7 @@ ; RUN: opt -mtriple=amdgcn-- -O1 -S < %s | FileCheck %s --check-prefixes=FUNC,LOOP +; RUN: opt -mtriple=amdgcn-- -passes='default<O1>' -S < %s | FileCheck %s --check-prefixes=FUNC,LOOP ; RUN: opt -mtriple=amdgcn-- -O1 -S -disable-promote-alloca-to-vector < %s | FileCheck %s --check-prefixes=FUNC,FULL-UNROLL +; RUN: opt -mtriple=amdgcn-- -passes='default<O1>' -S -disable-promote-alloca-to-vector < %s | FileCheck %s --check-prefixes=FUNC,FULL-UNROLL target datalayout = "A5" diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll @@ -4,6 +4,7 @@ ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mtriple=r600-- -mcpu=redwood < %s | FileCheck --check-prefix=EG 
-check-prefix=FUNC %s ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-promote-alloca,sroa,instcombine < %s | FileCheck -check-prefix=OPT %s target datalayout = "A5" ; OPT-LABEL: @vector_read( diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp --- a/llvm/tools/opt/opt.cpp +++ b/llvm/tools/opt/opt.cpp @@ -462,6 +462,13 @@ // TODO: use a codegen version of PassRegistry.def/PassBuilder::is*Pass() once // it exists. static bool shouldPinPassToLegacyPM(StringRef Pass) { + std::vector<StringRef> PassNameExactToIgnore = { + "amdgpu-simplifylib", "amdgpu-usenative", "amdgpu-promote-alloca", + "amdgpu-promote-alloca-to-vector"}; + for (const auto &P : PassNameExactToIgnore) + if (Pass == P) + return false; + std::vector<StringRef> PassNamePrefix = { "x86-", "xcore-", "wasm-", "systemz-", "ppc-", "nvvm-", "nvptx-", "mips-", "lanai-", "hexagon-", "bpf-", "avr-", "thumb2-", "arm-",