diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -73,6 +73,7 @@ FunctionPass *createAMDGPURewriteOutArgumentsPass(); ModulePass *createAMDGPULowerModuleLDSPass(); FunctionPass *createSIModeRegisterPass(); +ModulePass *createAMDGPUSortLDSGlobalsPass(); struct AMDGPUSimplifyLibCallsPass : PassInfoMixin { AMDGPUSimplifyLibCallsPass(TargetMachine &TM) : TM(TM) {} @@ -335,6 +336,14 @@ void initializeGCNNSAReassignPass(PassRegistry &); extern char &GCNNSAReassignID; +void initializeAMDGPUSortLDSGlobalsPass(PassRegistry &); +extern char &AMDGPUSortLDSGlobalsID; + +struct AMDGPUSortLDSGlobalsPass : PassInfoMixin { + AMDGPUSortLDSGlobalsPass() {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -9,8 +9,10 @@ #include "AMDGPUMachineFunction.h" #include "AMDGPUPerfHintAnalysis.h" #include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/Target/TargetMachine.h" +#include using namespace llvm; @@ -44,12 +46,38 @@ if (!Entry.second) return Entry.first->second; + // Find the position of `GV` within sorted LDS global list, and it should be + // available. + auto Iter = std::find(AMDGPUTargetMachine::SortedLDSGlobals.begin(), + AMDGPUTargetMachine::SortedLDSGlobals.end(), &GV); + assert(Iter != AMDGPUTargetMachine::SortedLDSGlobals.end() && + "Expected GV to be available within sorted LDS global list"); + + // If required, allocate memory for all the predecessors of `GV` in the sorted + // LDS global list. + // + // FIXME: Factor out common code. + for (auto LI = AMDGPUTargetMachine::SortedLDSGlobals.begin(); LI != Iter; + ++LI) { + auto *LDS = *LI; + + auto PredEntry = LocalMemoryObjects.insert(std::make_pair(LDS, 0)); + if (!PredEntry.second) + continue; + + Align PredAlignment = + DL.getValueOrABITypeAlignment(LDS->getAlign(), LDS->getValueType()); + + unsigned PredOffset = StaticLDSSize = alignTo(StaticLDSSize, PredAlignment); + + PredEntry.first->second = PredOffset; + StaticLDSSize += DL.getTypeAllocSize(LDS->getValueType()); + } + + // Allocate memory for `GV`. Align Alignment = DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType()); - /// TODO: We should sort these to minimize wasted space due to alignment - /// padding. Currently the padding is decided by the first encountered use - /// during lowering. unsigned Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment); Entry.first->second = Offset; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSortLDSGlobals.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSortLDSGlobals.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUSortLDSGlobals.cpp @@ -0,0 +1,101 @@ +//===-- AMDGPUSortLDSGlobals.cpp ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "Utils/AMDGPULDSUtils.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include +#include + +#define DEBUG_TYPE "amdgpu-sort-lds-globals" + +using namespace llvm; + +namespace { + +static void sortLDSGlobals(Module &M) { + // Collect all valid static LDS globals. + std::vector LDSGlobals = AMDGPU::collectStaticLDSGlobals(M); + + const DataLayout &DL = M.getDataLayout(); + + // Sort LDS globals by size, descending, and then, by alignment, descending. + // On ties, sort by name, lexicographical. + llvm::stable_sort( + LDSGlobals, + [&](const GlobalVariable *LHS, const GlobalVariable *RHS) -> bool { + TypeSize SLHS = DL.getTypeAllocSize(LHS->getValueType()); + TypeSize SRHS = DL.getTypeAllocSize(RHS->getValueType()); + if (SLHS != SRHS) { + return SLHS > SRHS; + } + + Align ALHS = AMDGPU::getAlign(DL, LHS); + Align ARHS = AMDGPU::getAlign(DL, RHS); + if (ALHS != ARHS) { + return ALHS > ARHS; + } + + return LHS->getName() < RHS->getName(); + }); + + // Preserve sorted LDS globals which will be required to use during LDS + // allocation in ISEL pass. + // + // Module LDS which is possibly created by the "Lower Module LDS" pass, should + // be allocated at address 0, irrespective of its size and alignment. + GlobalVariable *ModuleLDS = M.getGlobalVariable("llvm.amdgcn.module.lds"); + if (ModuleLDS) + AMDGPUTargetMachine::SortedLDSGlobals.push_back(ModuleLDS); + + for (auto *LDS : LDSGlobals) { + if (LDS != ModuleLDS) + AMDGPUTargetMachine::SortedLDSGlobals.push_back(LDS); + } +} + +class AMDGPUSortLDSGlobals : public ModulePass { +public: + static char ID; + + AMDGPUSortLDSGlobals() : ModulePass(ID) { + initializeAMDGPUSortLDSGlobalsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override; +}; + +} // namespace + +char AMDGPUSortLDSGlobals::ID = 0; +char &llvm::AMDGPUSortLDSGlobalsID = AMDGPUSortLDSGlobals::ID; + +INITIALIZE_PASS(AMDGPUSortLDSGlobals, DEBUG_TYPE, + "Sort LDS globals based on size and alignment", + false /*only look at the cfg*/, false /*analysis pass*/) + +bool AMDGPUSortLDSGlobals::runOnModule(Module &M) { + sortLDSGlobals(M); + return false; +} + +ModulePass *llvm::createAMDGPUSortLDSGlobalsPass() { + return new AMDGPUSortLDSGlobals(); +} + +PreservedAnalyses AMDGPUSortLDSGlobalsPass::run(Module &M, + ModuleAnalysisManager &AM) { + sortLDSGlobals(M); + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -36,6 +36,9 @@ static bool EnableFunctionCalls; static bool EnableFixedFunctionABI; static bool EnableLowerModuleLDS; + // FIXME: Ugly programming, find a suitable place for this data structure as + // soon as possible. + static std::vector SortedLDSGlobals; AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -198,6 +198,11 @@ cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true), cl::Hidden); +static cl::opt + EnableSortLDSGlobals("amdgpu-enable-sort-lds-globals", + cl::desc("Enable sort LDS globals pass"), + cl::init(true), cl::Hidden); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine X(getTheAMDGPUTarget()); @@ -263,6 +268,7 @@ initializeAMDGPUSimplifyLibCallsPass(*PR); initializeAMDGPUPrintfRuntimeBindingPass(*PR); initializeGCNNSAReassignPass(*PR); + initializeAMDGPUSortLDSGlobalsPass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -393,6 +399,10 @@ bool AMDGPUTargetMachine::EnableFunctionCalls = false; bool AMDGPUTargetMachine::EnableFixedFunctionABI = false; bool AMDGPUTargetMachine::EnableLowerModuleLDS = true; +// FIXME: Ugly programming, find a suitable place for this data structure as +// soon as possible. +std::vector AMDGPUTargetMachine::SortedLDSGlobals = + std::vector(); AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; @@ -509,6 +519,10 @@ PM.addPass(AMDGPULowerModuleLDSPass()); return true; } + if (PassName == "amdgpu-sort-lds-globals") { + PM.addPass(AMDGPUSortLDSGlobalsPass()); + return true; + } return false; }); PB.registerPipelineParsingCallback( @@ -936,6 +950,10 @@ ? EnableScalarIRPasses : TM.getOptLevel() > CodeGenOpt::Less) addEarlyCSEOrGVNPass(); + + // This pass should always be run as last LLVM IR pass just before ISel pass. + if (EnableSortLDSGlobals) + addPass(createAMDGPUSortLDSGlobalsPass()); } void AMDGPUPassConfig::addCodeGenPrepare() { diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -82,6 +82,7 @@ AMDGPURegBankCombiner.cpp AMDGPURegisterBankInfo.cpp AMDGPURewriteOutArguments.cpp + AMDGPUSortLDSGlobals.cpp AMDGPUSubtarget.cpp AMDGPUTargetMachine.cpp AMDGPUTargetObjectFile.cpp diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h @@ -31,6 +31,8 @@ SmallPtrSet getUsedList(Module &M); +std::vector collectStaticLDSGlobals(Module &M); + } // end namespace AMDGPU } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp @@ -122,6 +122,51 @@ return UsedList; } +std::vector collectStaticLDSGlobals(Module &M) { + std::vector StaticLDSGlobals; + + for (auto &GV : M.globals()) { + if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) { + // Skip non LDS global. + continue; + } + + if (GV.hasExternalLinkage()) { + // HIP uses an unsized array `extern __shared__ T s[]` or similar + // zero-sized type in other languages to declare the dynamic shared + // memory variable whose size is not known at the compile time. They will + // be allocated by the runtime and placed directly after the statically + // allocated ones. Skip such dynamic shared memory variables. + assert(GV.getType()->isEmptyTy() && + "Expected dynamic shared memory variable type to be empty"); + continue; + } + + if (!GV.hasInitializer()) { + // Static LDS globals should have "UndefValue" as an initializer. Skip LDS + // globals which do not have an initializer. ISEL pass will catch such + // static LDS globals and report error accordingly. + continue; + } + + if (!isa(GV.getInitializer())) { + // Static LDS globals should have "UndefValue" as an initializer. Skip LDS + // globals which have an initialzer but it is not "UndefValue". ISEL pass + // will catch such static LDS globals and report error accordingly. + continue; + } + + if (GV.isConstant()) { + // A constant LDS global cannot be allocated. Skip it. + continue; + } + + StaticLDSGlobals.push_back(&GV); + } + + return StaticLDSGlobals; +} + } // end namespace AMDGPU } // end namespace llvm