diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -271,6 +271,10 @@
 void initializeGCNNSAReassignPass(PassRegistry &);
 extern char &GCNNSAReassignID;
 
+ModulePass *createAMDGPUDeviceScopeSharedVariablePass();
+void initializeAMDGPUDeviceScopeSharedVariablePass(PassRegistry &);
+extern char &AMDGPUDeviceScopeSharedVariableID;
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -123,13 +123,15 @@
   // should only appear when IPO passes manages to move LDs defined in a kernel
   // into a single user function.
 
-  for (GlobalVariable &GV : M.globals()) {
-    // TODO: Region address
-    unsigned AS = GV.getAddressSpace();
-    if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
-      continue;
-
-    recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+  if (!AMDGPUTargetMachine::EnableDeviceScopeSharedVariable) {
+    for (GlobalVariable &GV : M.globals()) {
+      // TODO: Region address
+      unsigned AS = GV.getAddressSpace();
+      if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
+        continue;
+
+      recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+    }
   }
 
   if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp b/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp
@@ -0,0 +1,463 @@
+//===-- AMDGPUDeviceScopeSharedVariable.cpp -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// For every AMDGPU kernel, this pass lays out the internal LDS (local address
+// space) globals associated with the kernel in a single per-kernel LDS byte
+// array and redirects the kernel's own uses of those globals to their slot in
+// that array.
+//
+// TODO: Globals attributed to a callee of the kernel are given a slot in the
+// array, but their uses are not redirected yet.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/Debug.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <utility>
+
+#define DEBUG_TYPE "amdgpu-device-scope-shared-variable"
+
+using namespace llvm;
+
+namespace {
+
+// Containers shared by the helpers below.
+using LDSSet = SetVector<const GlobalVariable *>;
+using FunctionSet = SetVector<const Function *>;
+using LDSToFunctionMap = ValueMap<const GlobalVariable *, const Function *>;
+using FunctionToLDSMap = ValueMap<const Function *, LDSSet>;
+using KernelToCalleeMap = ValueMap<const Function *, FunctionSet>;
+using LDSToSizeMap = ValueMap<const GlobalVariable *, uint64_t>;
+
+class AMDGPUDeviceScopeSharedVariable : public ModulePass {
+public:
+  static char ID;
+
+  AMDGPUDeviceScopeSharedVariable() : ModulePass(ID) {
+    initializeAMDGPUDeviceScopeSharedVariablePass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override;
+};
+
+} // namespace
+
+char AMDGPUDeviceScopeSharedVariable::ID = 0;
+
+char &llvm::AMDGPUDeviceScopeSharedVariableID =
+    AMDGPUDeviceScopeSharedVariable::ID;
+
+ModulePass *llvm::createAMDGPUDeviceScopeSharedVariablePass() {
+  return new AMDGPUDeviceScopeSharedVariable();
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPUDeviceScopeSharedVariable,
+                      "implement-amdgpu-device-scope-shared-variable",
+                      "Implement AMDGPU Device Scope Shared Variable",
+                      false /*only look at the cfg*/, false /*analysis pass*/)
+INITIALIZE_PASS_DEPENDENCY(AMDGPUAlwaysInline)
+INITIALIZE_PASS_DEPENDENCY(SimpleInliner)
+INITIALIZE_PASS_END(AMDGPUDeviceScopeSharedVariable,
+                    "implement-amdgpu-device-scope-shared-variable",
+                    "Implement AMDGPU Device Scope Shared Variable",
+                    false /*only look at the cfg*/, false /*analysis pass*/)
+
+/// Returns the function containing \p I, or null if \p I is not linked into a
+/// function.
+static const Function *getEnclosingFunction(const Instruction *I) {
+  const BasicBlock *BB = I->getParent();
+  return BB ? BB->getParent() : nullptr;
+}
+
+/// Group the LDS globals by the function recorded for them in
+/// \p LDSToFunction. The globals are visited in \p LDSGlobals order rather
+/// than in map order, so the contents of each per-function set do not depend
+/// on how the map happens to be iterated.
+static void createFunctionToLDSMap(const LDSSet &LDSGlobals,
+                                   LDSToFunctionMap &LDSToFunction,
+                                   FunctionToLDSMap &FunctionToLDS) {
+  for (const GlobalVariable *LDSGlobal : LDSGlobals) {
+    auto OwnerIt = LDSToFunction.find(LDSGlobal);
+    if (OwnerIt == LDSToFunction.end())
+      continue; // No enclosing function was recorded for this global.
+    // operator[] creates the set the first time a function is seen.
+    FunctionToLDS[OwnerIt->second].insert(LDSGlobal);
+  }
+}
+
+/// Record in \p KernelToExtendedLDS every LDS global associated with kernel
+/// \p K: the ones attributed to \p K itself followed by the ones attributed
+/// to the callees listed for \p K in \p KernelToCallee. Kernels with no
+/// associated LDS global get no entry.
+static void pairUpKernelWithLDSList(const Function *K,
+                                    KernelToCalleeMap &KernelToCallee,
+                                    FunctionToLDSMap &FunctionToLDS,
+                                    FunctionToLDSMap &KernelToExtendedLDS) {
+  LDSSet ExtendedLDSSet;
+
+  // LDS globals attributed to the kernel itself.
+  auto KernelIt = FunctionToLDS.find(K);
+  if (KernelIt != FunctionToLDS.end())
+    ExtendedLDSSet = KernelIt->second;
+
+  // LDS globals attributed to the callees of the kernel.
+  auto CalleesIt = KernelToCallee.find(K);
+  if (CalleesIt != KernelToCallee.end()) {
+    for (const Function *Callee : CalleesIt->second) {
+      auto CalleeIt = FunctionToLDS.find(Callee);
+      if (CalleeIt == FunctionToLDS.end())
+        continue;
+      const LDSSet &CalleeLDS = CalleeIt->second;
+      ExtendedLDSSet.insert(CalleeLDS.begin(), CalleeLDS.end());
+    }
+  }
+
+  // Leave kernels without LDS globals out of the map so that callers can
+  // skip them with a simple lookup.
+  if (!ExtendedLDSSet.empty())
+    KernelToExtendedLDS[K] = std::move(ExtendedLDSSet);
+}
+
+/// Record in \p KernelToCallee the functions that are reachable from kernel
+/// \p K in the call graph \p CG and that have an entry in \p FunctionToLDS.
+static void pairUpKernelWithCalleeList(const CallGraph &CG, const Function *K,
+                                       FunctionToLDSMap &FunctionToLDS,
+                                       KernelToCalleeMap &KernelToCallee) {
+  const CallGraphNode *KernelNode = CG[K];
+  assert(KernelNode && "Call graph node associated with kernel definition "
+                       "cannot be null");
+
+  // Depth-first walk over everything reachable from the kernel.
+  SmallVector<const CallGraphNode *, 8> Worklist;
+  SetVector<const CallGraphNode *> Visited;
+  for (const auto &CallRecord : *KernelNode)
+    Worklist.push_back(CallRecord.second);
+
+  FunctionSet CalleeSet;
+  while (!Worklist.empty()) {
+    const CallGraphNode *Node = Worklist.pop_back_val();
+    assert(Node && "Call graph node of a callee cannot be null");
+    if (!Visited.insert(Node))
+      continue;
+
+    const Function *F = Node->getFunction();
+    if (!F || F->isDeclaration()) {
+      assert(Node->empty() && "Call graph node associated with function "
+                              "declaration should not have callee list");
+      continue;
+    }
+
+    if (FunctionToLDS.find(F) != FunctionToLDS.end())
+      CalleeSet.insert(F);
+
+    for (const auto &CallRecord : *Node)
+      Worklist.push_back(CallRecord.second);
+  }
+
+  KernelToCallee[K] = std::move(CalleeSet);
+}
+
+/// Find a function that uses \p LDSGlobal, looking through non-instruction
+/// users such as constant expressions, and record it in \p LDSToFunction.
+/// Nothing is recorded when no instruction inside a function is reachable
+/// from the users of the global, for example when the global is unused.
+///
+/// TODO: A global that is used by several functions is attributed only to
+/// the first one found.
+static void
+pairUpLDSGlobalWithEnclosingFunction(const GlobalVariable *LDSGlobal,
+                                     LDSToFunctionMap &LDSToFunction) {
+  SmallVector<const User *, 8> Worklist(LDSGlobal->user_begin(),
+                                        LDSGlobal->user_end());
+  SetVector<const User *> Visited;
+
+  while (!Worklist.empty()) {
+    const User *U = Worklist.pop_back_val();
+    if (!Visited.insert(U))
+      continue;
+
+    if (const auto *I = dyn_cast<Instruction>(U)) {
+      if (const Function *F = getEnclosingFunction(I)) {
+        LDSToFunction[LDSGlobal] = F;
+        return;
+      }
+      continue;
+    }
+
+    Worklist.append(U->user_begin(), U->user_end());
+  }
+
+  LLVM_DEBUG(dbgs() << "No enclosing function found for LDS global "
+                    << LDSGlobal->getName() << "\n");
+}
+
+/// Record in \p LDSToSize the number of bytes \p LDSGlobal occupies in memory.
+static void getLDSGlobalSizeInBytes(Module &M, const GlobalVariable *LDSGlobal,
+                                    LDSToSizeMap &LDSToSize) {
+  // Use the allocation size rather than the bit size divided by eight: it
+  // rounds sub-byte types up and includes the tail padding of the type.
+  const DataLayout &DL = M.getDataLayout();
+  LDSToSize[LDSGlobal] =
+      DL.getTypeAllocSize(LDSGlobal->getValueType()).getFixedSize();
+}
+
+// TODO: Not implemented yet. Uses of LDS globals that are attributed to a
+// callee of the kernel are left untouched.
+static void replaceIndirectLDSGlobalWithinCallees() {}
+
+/// Returns true if every user of \p LDS is an instruction that belongs to
+/// kernel \p K.
+static bool isUsedOnlyByInstructionsIn(const GlobalVariable *LDS,
+                                       const Function *K) {
+  for (const User *U : LDS->users()) {
+    const auto *I = dyn_cast<Instruction>(U);
+    if (!I || getEnclosingFunction(I) != K)
+      return false;
+  }
+  return true;
+}
+
+/// Redirect every use of \p LDS made by an instruction of kernel \p K to
+/// \p PtrInst, which must have the same type as \p LDS.
+static void replaceDirectLDSGlobalWithinKernel(const Function *K,
+                                               const GlobalVariable *LDS,
+                                               Instruction *PtrInst) {
+  assert(PtrInst->getType() == LDS->getType() &&
+         "Replacement pointer must have the type of the LDS global");
+
+  // Setting a use unlinks it from the use list of the global, so collect the
+  // uses first instead of modifying the list while walking it.
+  SmallVector<Use *, 8> UsesInKernel;
+  for (const Use &U : LDS->uses()) {
+    const auto *UserInst = dyn_cast<Instruction>(U.getUser());
+    if (UserInst && getEnclosingFunction(UserInst) == K)
+      UsesInKernel.push_back(const_cast<Use *>(&U));
+  }
+
+  // Only the operand changes, so the using instruction keeps its own
+  // attributes and metadata.
+  for (Use *U : UsesInKernel)
+    U->set(PtrInst);
+}
+
+/// Insert, at the first insertion point of the entry block of kernel \p K, the
+/// instructions that compute the address of the slot of \p LDS inside
+/// \p NewLDS:
+///   1. a GEP to byte \p Offset of \p NewLDS, and
+///   2. a cast of that address to the type of \p LDS.
+/// Returns the cast, which has the same type as \p LDS.
+static Instruction *insertNecessaryInstructionsWithinKernel(
+    Module &M, const Function *K, const GlobalVariable *LDS,
+    const GlobalVariable *NewLDS, const uint64_t Offset) {
+  BasicBlock::const_iterator InsertPt =
+      K->getEntryBlock().getFirstInsertionPt();
+  assert(InsertPt != K->getEntryBlock().end() &&
+         "Entry basic block of the kernel cannot be empty");
+  auto *InsertBefore = const_cast<Instruction *>(&*InsertPt);
+
+  // The names carry no hand-made suffix: value names are made unique within a
+  // function automatically, which avoids a mutable static counter.
+  LLVMContext &Ctx = M.getContext();
+  Value *Indices[] = {ConstantInt::get(Type::getInt32Ty(Ctx), 0),
+                      ConstantInt::get(Type::getInt64Ty(Ctx), Offset)};
+  Instruction *GEPInst = GetElementPtrInst::CreateInBounds(
+      NewLDS->getValueType(), const_cast<GlobalVariable *>(NewLDS), Indices,
+      "dssv.gep", InsertBefore);
+
+  // Cast to the pointer type of the original global so that the result can
+  // take the place of the global in any of its users.
+  return new BitCastInst(GEPInst, LDS->getType(), "dssv.cast", InsertBefore);
+}
+
+/// Lay out the LDS globals in \p ExtendedLDSList in one new LDS byte array for
+/// kernel \p K, and redirect the uses of the globals that are attributed to
+/// \p K to their slot in that array. Returns true if the module was changed.
+static bool handleDeviceScopeSharedVariablesForCurKernel(
+    Module &M, const Function *K, LDSToFunctionMap &LDSToFunction,
+    LDSToSizeMap &LDSToSize, const LDSSet &ExtendedLDSList) {
+  if (ExtendedLDSList.empty())
+    return false;
+
+  // Give each global an offset that honors its alignment, and remember the
+  // strictest alignment seen so that the array itself can be aligned to it.
+  const DataLayout &DL = M.getDataLayout();
+  LDSToSizeMap LDSToOffset;
+  uint64_t TotalLDSSizeInBytes = 0;
+  Align MaxAlign(1);
+  for (const GlobalVariable *LDS : ExtendedLDSList) {
+    const Align LDSAlign = DL.getPreferredAlign(LDS);
+    MaxAlign = std::max(MaxAlign, LDSAlign);
+    TotalLDSSizeInBytes = alignTo(TotalLDSSizeInBytes, LDSAlign);
+    LDSToOffset[LDS] = TotalLDSSizeInBytes;
+    TotalLDSSizeInBytes += LDSToSize.lookup(LDS);
+  }
+
+  // One contiguous LDS byte array stands for all the globals laid out above.
+  // TODO: Is the name of the new LDS global fine, or does it need to change?
+  Type *NewLDSTy =
+      ArrayType::get(Type::getInt8Ty(M.getContext()), TotalLDSSizeInBytes);
+  GlobalVariable *NewLDS = new GlobalVariable(
+      M, NewLDSTy, false, GlobalValue::InternalLinkage,
+      UndefValue::get(NewLDSTy), Twine(K->getName()) + Twine("_LDSLayout"),
+      nullptr, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
+  NewLDS->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+  NewLDS->setAlignment(
+      MaybeAlign(std::max(MaxAlign, DL.getPreferredAlign(NewLDS))));
+
+  for (const GlobalVariable *LDS : ExtendedLDSList) {
+    auto OwnerIt = LDSToFunction.find(LDS);
+    if (OwnerIt == LDSToFunction.end() || OwnerIt->second != K) {
+      replaceIndirectLDSGlobalWithinCallees();
+      continue;
+    }
+
+    // Redirecting only some of the uses would leave the variable with two
+    // storage locations, so redirect all of them or none.
+    if (!isUsedOnlyByInstructionsIn(LDS, K)) {
+      LLVM_DEBUG(dbgs() << "LDS global " << LDS->getName()
+                        << " is not used only by instructions of kernel "
+                        << K->getName() << ", leaving it as is\n");
+      continue;
+    }
+
+    Instruction *PtrInst = insertNecessaryInstructionsWithinKernel(
+        M, K, LDS, NewLDS, LDSToOffset.lookup(LDS));
+    replaceDirectLDSGlobalWithinKernel(K, LDS, PtrInst);
+  }
+
+  // A new global was added to the module.
+  return true;
+}
+
+/// Process every kernel in \p Kernels that has an entry in
+/// \p KernelToExtendedLDS. Returns true if the module was changed.
+static bool handleDeviceScopeSharedVariables(
+    Module &M, const FunctionSet &Kernels, LDSToFunctionMap &LDSToFunction,
+    FunctionToLDSMap &KernelToExtendedLDS, LDSToSizeMap &LDSToSize) {
+  bool Changed = false;
+  for (const Function *K : Kernels) {
+    auto LDSListIt = KernelToExtendedLDS.find(K);
+    if (LDSListIt == KernelToExtendedLDS.end())
+      continue; // No LDS globals are associated with this kernel.
+    Changed |= handleDeviceScopeSharedVariablesForCurKernel(
+        M, K, LDSToFunction, LDSToSize, LDSListIt->second);
+  }
+  return Changed;
+}
+
+/// Gather, for the LDS globals in \p LDSGlobals and the kernels in
+/// \p Kernels, the information the transformation needs, and run it. Returns
+/// true if the module was changed.
+static bool handleDeviceScopeSharedVariables(Module &M,
+                                             const LDSSet &LDSGlobals,
+                                             const FunctionSet &Kernels) {
+  // LDS global -> function it is attributed to.
+  LDSToFunctionMap LDSToFunction;
+  for (const GlobalVariable *LDSGlobal : LDSGlobals)
+    pairUpLDSGlobalWithEnclosingFunction(LDSGlobal, LDSToFunction);
+
+  // Function -> LDS globals attributed to it.
+  FunctionToLDSMap FunctionToLDS;
+  createFunctionToLDSMap(LDSGlobals, LDSToFunction, FunctionToLDS);
+
+  // Kernel -> reachable callees that have LDS globals attributed to them. The
+  // call graph is built once and shared by all the kernels.
+  CallGraph CG(M);
+  KernelToCalleeMap KernelToCallee;
+  for (const Function *K : Kernels)
+    pairUpKernelWithCalleeList(CG, K, FunctionToLDS, KernelToCallee);
+
+  // Kernel -> LDS globals attributed to the kernel or to one of those callees.
+  FunctionToLDSMap KernelToExtendedLDS;
+  for (const Function *K : Kernels)
+    pairUpKernelWithLDSList(K, KernelToCallee, FunctionToLDS,
+                            KernelToExtendedLDS);
+
+  // LDS global -> size in bytes.
+  LDSToSizeMap LDSToSize;
+  for (const GlobalVariable *LDSGlobal : LDSGlobals)
+    getLDSGlobalSizeInBytes(M, LDSGlobal, LDSToSize);
+
+  return handleDeviceScopeSharedVariables(M, Kernels, LDSToFunction,
+                                          KernelToExtendedLDS, LDSToSize);
+}
+
+/// Collect the internal LDS globals and the kernel definitions of \p M and
+/// process them. Returns true if the module was changed.
+static bool handleDeviceScopeSharedVariables(Module &M) {
+  // Collect all the (static) LDS globals defined within the current module
+  LDSSet LDSGlobals;
+  for (GlobalVariable &GV : M.globals())
+    if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+        GV.hasInternalLinkage())
+      LDSGlobals.insert(&GV);
+
+  if (LDSGlobals.empty()) {
+    LLVM_DEBUG(dbgs() << "No LDS globals defined in the module " << M.getName()
+                      << ", skipping handling of device scope shared variables"
+                      << "\n");
+    return false;
+  }
+
+  // Collect all the amdgpu kernels defined within the current module
+  FunctionSet Kernels;
+  for (Function &F : M.functions()) {
+    if ((F.getCallingConv() == CallingConv::AMDGPU_KERNEL) &&
+        !F.isDeclaration())
+      Kernels.insert(&F);
+  }
+
+  if (Kernels.empty()) {
+    LLVM_DEBUG(dbgs() << "No kernels defined in the module " << M.getName()
+                      << ", skipping handling of device scope shared variables"
+                      << "\n");
+    return false;
+  }
+
+  return handleDeviceScopeSharedVariables(M, LDSGlobals, Kernels);
+}
+
+bool AMDGPUDeviceScopeSharedVariable::runOnModule(Module &M) {
+  LLVM_DEBUG(dbgs() << "===== Handling device scope shared variables in the "
+                       "module "
+                    << M.getName() << "\n");
+
+  // TODO: We only want to handle HIP kernels, and no kernels from other
+  // programming languages, like OpenCL, OpenMP, etc. Do we need to add a
+  // condition here for it, and skip running the pass for non-HIP kernels?
+  if (skipModule(M)) {
+    LLVM_DEBUG(dbgs() << "Skipping handling of device scope shared variables "
+                         "in the module "
+                      << M.getName() << "\n");
+    return false;
+  }
+
+  bool Changed = handleDeviceScopeSharedVariables(M);
+
+  LLVM_DEBUG(dbgs() << "===== Done with handling device scope shared variables "
+                       "in the module "
+                    << M.getName() << "\n");
+
+  return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -40,6 +40,7 @@
   static bool EnableLateStructurizeCFG;
   static bool EnableFunctionCalls;
   static bool EnableFixedFunctionABI;
+  static bool EnableDeviceScopeSharedVariable;
 
   AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                       StringRef FS, TargetOptions Options,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -198,6 +198,12 @@
     cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
     cl::Hidden);
 
+static cl::opt<bool, true> EnableDeviceScopeSharedVariable(
+    "amdgpu-enable-device-scope-shared-variable",
+    cl::desc("Support amdgpu device scope shared variables"),
+    cl::location(AMDGPUTargetMachine::EnableDeviceScopeSharedVariable),
+    cl::init(false), cl::Hidden);
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -265,6 +271,7 @@
   initializeGCNRegBankReassignPass(*PR);
   initializeGCNNSAReassignPass(*PR);
   initializeSIAddIMGInitPass(*PR);
+  initializeAMDGPUDeviceScopeSharedVariablePass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -394,6 +401,7 @@
 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
 bool AMDGPUTargetMachine::EnableFunctionCalls = false;
 bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
+bool AMDGPUTargetMachine::EnableDeviceScopeSharedVariable = false;
 
 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
 
@@ -771,6 +779,11 @@
   // but EarlyCSE can do neither of them.
   if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses)
     addEarlyCSEOrGVNPass();
+
+  // We expect to run this pass as a last IR pass. Hence make sure that this
+  // pass is added as a last IR pass
+  if (EnableDeviceScopeSharedVariable)
+    addPass(createAMDGPUDeviceScopeSharedVariablePass());
 }
 
 void AMDGPUPassConfig::addCodeGenPrepare() {
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -48,6 +48,7 @@
   AMDGPUAtomicOptimizer.cpp
   AMDGPUCallLowering.cpp
   AMDGPUCodeGenPrepare.cpp
+  AMDGPUDeviceScopeSharedVariable.cpp
   AMDGPUExportClustering.cpp
   AMDGPUFixFunctionBitcasts.cpp
   AMDGPUFrameLowering.cpp