diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -271,6 +271,10 @@ void initializeGCNNSAReassignPass(PassRegistry &); extern char &GCNNSAReassignID; +ModulePass *createAMDGPUDeviceScopeSharedVariablePass(); +void initializeAMDGPUDeviceScopeSharedVariablePass(PassRegistry &); +extern char &AMDGPUDeviceScopeSharedVariableID; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -123,13 +123,15 @@ // should only appear when IPO passes manages to move LDs defined in a kernel // into a single user function. - for (GlobalVariable &GV : M.globals()) { - // TODO: Region address - unsigned AS = GV.getAddressSpace(); - if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS) - continue; - - recursivelyVisitUsers(GV, FuncsToAlwaysInline); + if (!AMDGPUTargetMachine::EnableDeviceScopeSharedVariable) { + for (GlobalVariable &GV : M.globals()) { + // TODO: Region address + unsigned AS = GV.getAddressSpace(); + if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS) + continue; + + recursivelyVisitUsers(GV, FuncsToAlwaysInline); + } } if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp b/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp @@ -0,0 +1,709 @@ +//===-- AMDGPUDeviceScopeSharedVariables.cpp ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// TODO: +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueMap.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include +#include + +#define DEBUG_TYPE "amdgpu-device-scope-shared-variable" + +using namespace llvm; + +namespace { + +class AMDGPUDeviceScopeSharedVariable : public ModulePass { +public: + static char ID; + + AMDGPUDeviceScopeSharedVariable() : ModulePass(ID) { + initializeAMDGPUDeviceScopeSharedVariablePass( + *PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override; +}; + +} // namespace + +char AMDGPUDeviceScopeSharedVariable::ID = 0; + +char &llvm::AMDGPUDeviceScopeSharedVariableID = + AMDGPUDeviceScopeSharedVariable::ID; + +ModulePass *llvm::createAMDGPUDeviceScopeSharedVariablePass() { + return new AMDGPUDeviceScopeSharedVariable(); +} + +INITIALIZE_PASS_BEGIN(AMDGPUDeviceScopeSharedVariable, + "implement-amdgpu-device-scope-shared-variable", + "Implement AMDPGU Device Scope Shared Variable", + false /*only look at the cfg*/, false /*analysis pass*/) +INITIALIZE_PASS_DEPENDENCY(AMDGPUAlwaysInline) +INITIALIZE_PASS_DEPENDENCY(SimpleInliner) +INITIALIZE_PASS_END(AMDGPUDeviceScopeSharedVariable, + "implement-amdgpu-device-scope-shared-variable", + "Implement AMDPGU Device Scope Shared Variable", + false /*only look at the cfg*/, false /*analysis pass*/) + +static void +updateLDSToFunctionMap(ValueMap &LDSToFunction, + ValueMap &OldCallieToNewCallie) { + for (auto it = LDSToFunction.begin(); it != LDSToFunction.end(); ++it) { + GlobalVariable *LDS = it->first; + Function *OldF = it->second; + if (OldCallieToNewCallie.find(OldF) != OldCallieToNewCallie.end()) + LDSToFunction[LDS] = OldCallieToNewCallie[OldF]; + } +} + +static void createFunctionToLDSMap( + ValueMap &LDSToFunction, + ValueMap> &FunctionToLDS) { + for (auto it = LDSToFunction.begin(); it != LDSToFunction.end(); ++it) { + GlobalVariable *LDSGlobal = it->first; + Function *EnclosingFunction = it->second; + auto rit = FunctionToLDS.find(EnclosingFunction); + if (rit == FunctionToLDS.end()) { + SetVector LDSSet; + LDSSet.insert(LDSGlobal); + FunctionToLDS[EnclosingFunction] = LDSSet; + } else + FunctionToLDS[EnclosingFunction].insert(LDSGlobal); + } +} + +static void pairUpKernelWithLDSList( + Function *K, ValueMap> &KernelToCallie, + ValueMap> &FunctionToLDS, + ValueMap> &KernelToDirectLDS, + ValueMap> &KernelToIndirectLDS) { + // If direct LDS globals exist within the kernel, collect it + if (FunctionToLDS.find(K) != FunctionToLDS.end()) + KernelToDirectLDS[K] = FunctionToLDS[K]; + + // Collect all the indirect LDS globals defined within the callie(s) of the + // kernel + SetVector IndirectLDSSet; + SetVector Callies = KernelToCallie[K]; + for (Function *Callie : Callies) { + if (FunctionToLDS.find(Callie) == FunctionToLDS.end()) + continue; + SetVector CallieLDSList = FunctionToLDS[Callie]; + for (GlobalVariable *CallieLDS : CallieLDSList) + IndirectLDSSet.insert(CallieLDS); + } + KernelToIndirectLDS[K] = IndirectLDSSet; +} + +static void pairUpKernelWithCallieList( + Module &M, Function *K, + ValueMap> &FunctionToLDS, + ValueMap> &KernelToCallie) { + // Get the call graph node associated with current kernel, traverse the call + // graph associated with it in DFS manner and collect all the associated + // callies which define LDS global(s) + CallGraph CG = CallGraph(M); + CallGraphNode *KernCGNode = CG[K]; + SmallVector CGNodeStack; + SetVector Visited; + +#ifndef NDEBUG + assert(KernCGNode && "Call graph node associated with kernel definition " + "cannot be null\n"); +#endif + + for (auto it = KernCGNode->begin(); it != KernCGNode->end(); ++it) { + CallGraphNode *CGN = it->second; +#ifndef NDEBUG + assert(CGN && "Call graph node associated with function definition cannot" + " be null\n"); +#endif + CGNodeStack.push_back(CGN); + } + + SetVector CallieSet; + while (!CGNodeStack.empty()) { + CallGraphNode *CGNode = CGNodeStack.pop_back_val(); + if (!Visited.insert(CGNode)) + continue; + + Function *F = CGNode->getFunction(); + if (!F || F->isDeclaration()) { +#ifndef NDEBUG + assert(CGNode->empty() && "Call graph node associated with function " + "declaration should not have callie list\n"); +#endif + continue; + } + + auto fit = FunctionToLDS.find(F); + if (fit != FunctionToLDS.end()) + CallieSet.insert(F); + + for (auto it = CGNode->begin(); it != CGNode->end(); ++it) { + CallGraphNode *CGN = it->second; +#ifndef NDEBUG + assert(CGN && "Call graph node associated with function definition cannot" + " be null\n"); +#endif + CGNodeStack.push_back(CGN); + } + } + + KernelToCallie[K] = CallieSet; +} + +static void pairUpLDSGlobalWithEnclosingFunction( + GlobalVariable *LDSGlobal, + ValueMap &LDSToFunction) { + // Recursively visit the user list of current LDS global, and find the + // enclosing function where the LDS global is defined, and the enclosing + // function should always be successfully found. + // + // TODO: Is there any other efficient way to find the enclosing functions of + // LDS globals? +#ifndef NDEBUG + assert(!LDSGlobal->user_empty() && + "LDS Global user list cannot be empty since it must have been defined " + "within either kernel or device function"); +#endif + SmallVector UserStack; + SetVector Visited; + + for (User *U : LDSGlobal->users()) + UserStack.push_back(U); + + while (!UserStack.empty()) { + User *U = UserStack.pop_back_val(); + if (!Visited.insert(U)) + continue; + + if (Instruction *I = dyn_cast(U)) { + Function *F = I->getParent()->getParent(); + if (F) { + LDSToFunction[LDSGlobal] = F; + return; + } + continue; + } + + for (User *UU : U->users()) + UserStack.push_back(UU); + } +#ifndef NDEBUG + assert(false && "Control is not expected to reach this point"); +#endif +} + +static void +getLDSGlobalSizeInBytes(Module &M, GlobalVariable *LDSGlobal, + ValueMap &LDSToSize) { + Type *Ty = LDSGlobal->getValueType(); + const DataLayout &DL = M.getDataLayout(); + uint64_t SizeInBytes = DL.getTypeSizeInBits(Ty).getFixedSize() / 8; + LDSToSize[LDSGlobal] = SizeInBytes; +} + +static void replaceUseOfDirectLDSGlobalWithinKernel(Module &M, Function *K, + GlobalVariable *LDS, + Instruction *BasePtr) { + // Suffix the names of the instructions with unique integer values + static int Suffix = 0; + ++Suffix; + + // Traverse through each `use` of `LDS`, create a proper `ToBeReplacedInst` + // for each `use`, and accordingly replace it. + for (const User *U : LDS->users()) { + Instruction *UserInst = dyn_cast(const_cast(U)); + if (!UserInst) + continue; + + Instruction *ToBeReplacedInst = nullptr; + + if (GetElementPtrInst *GEPInst = dyn_cast(UserInst)) { + // User instruction is GEP instruction, replace it as below + // 1. Extract the last operand of `GEPInst`, say, it is, `Offset` + // 2. Create pointer arithmetic instruction `BasePtr + Offset` +#ifndef NDEBUG + assert(GEPInst->hasIndices() && "Expected one or more GEP indecies\n"); +#endif + Value *Offset = GEPInst->getOperand(GEPInst->getNumIndices()); + ToBeReplacedInst = GetElementPtrInst::CreateInBounds( + GEPInst->getResultElementType(), BasePtr, Offset, + Twine(BasePtr->getName()) + Twine(".ptr.arith.") + Twine(Suffix), + UserInst); + } else if (LoadInst *LInst = dyn_cast(UserInst)) { + // User instruction is LOAD instruction, replace pointer operand of + // LOAD instruction by `BasePtr` + ToBeReplacedInst = new LoadInst(LInst->getType(), BasePtr, + Twine(BasePtr->getName()) + + Twine(".load.") + Twine(Suffix), + UserInst); + } else if (StoreInst *SInst = dyn_cast(UserInst)) { + // User instruction is STORE instruction, replace pointer operand of + // STORE instruction by `BasePtr` + ToBeReplacedInst = + new StoreInst(SInst->getValueOperand(), BasePtr, UserInst); + } else { + // TODO: Do we need to specially handle any other kind of instructions + // apart from GEP, LOAD, and STORE? +#ifndef NDEBUG + assert(false && "Not implemented\n"); +#endif + } + + // Replace `UserInst` by `ToBeReplacedInst` and erase `UserInst`. +#ifndef NDEBUG + assert(ToBeReplacedInst && "To be replaced instruction cannot be null\n"); +#endif + ToBeReplacedInst->copyMetadata(*UserInst); + UserInst->replaceAllUsesWith(ToBeReplacedInst); + UserInst->eraseFromParent(); + } +} + +static Function *cloneCallie(Module &M, Type *NewParamType, Function *Callie) { + // Create a new function type by adding `NewParamType` to the end of existing + // parameter list. + SmallVector NewParams; + FunctionType *FnTy = Callie->getFunctionType(); + for (auto it = FnTy->param_begin(); it != FnTy->param_end(); ++it) + NewParams.push_back(*it); + NewParams.push_back(NewParamType); + FunctionType *NewFnTy = + FunctionType::get(FnTy->getReturnType(), NewParams, FnTy->isVarArg()); + + // Create a copy of the `Callie`, but with new function type + Function *NewCallie = + Function::Create(NewFnTy, Callie->getLinkage(), Callie->getAddressSpace(), + Callie->getName() + Twine(".c")); + + // TODO: what does this map required for? + ValueToValueMapTy VMap; + auto *NewCallieArgIt = NewCallie->arg_begin(); + for (auto &Arg : Callie->args()) { + auto ArgName = Arg.getName(); + NewCallieArgIt->setName(ArgName); + VMap[&Arg] = &(*NewCallieArgIt++); + } + + // TODO: ModuleLevelChanges should be set to true or false? + SmallVector Returns; + CloneFunctionInto(NewCallie, Callie, VMap, /*ModuleLevelChanges=*/false, + Returns); + + // Copy all metadata + SmallVector, 1> MDs; + Callie->getAllMetadata(MDs); + for (auto MDIt : MDs) + NewCallie->addMetadata(MDIt.first, *MDIt.second); + + // Insert `NewCallie` just before `Callie` within the module. + M.getFunctionList().insert(Callie->getIterator(), NewCallie); + + return NewCallie; +} + +static void CollectCallGraphPathsBetweenKernelAndCallie( + Module &M, Function *K, Function *Callie, + SmallVectorImpl> &CGPaths) { + // Traverse the call graph associated with the kernel in DFS manner and + // collect all the paths from kernel to callie. + // + // TODO: Note that this algorithm will not work if there exist recursive + // calls, and the current assumption here is that the call graph is acyclic. + // We need to visit it back again to handle call graph which could contain + // cycles. + CallGraph CG = CallGraph(M); + CallGraphNode *KernCGNode = CG[K]; +#ifndef NDEBUG + assert(KernCGNode && "Call graph node associated with kernel definition " + "cannot be null\n"); +#endif + + SmallVector, 8> Stack; + SetVector Path; + Path.insert(KernCGNode); + Stack.push_back(Path); + + while (!Stack.empty()) { + SetVector TopPath = Stack.pop_back_val(); + CallGraphNode *CGNode = TopPath.back(); + Function *F = CGNode->getFunction(); + if (F == Callie) { + SetVector FPath; + for (CallGraphNode *CGN : TopPath) + FPath.insert(CGN->getFunction()); + CGPaths.push_back(FPath); + continue; + } + + for (auto it = CGNode->begin(); it != CGNode->end(); ++it) { + CallGraphNode *CGN = it->second; +#ifndef NDEBUG + assert(CGN && "Call graph node associated with function definition cannot" + " be null\n"); +#endif + SetVector ClonedPath(TopPath.begin(), TopPath.end()); + ClonedPath.insert(CGN); + Stack.push_back(ClonedPath); + } + } +} + +static void createCloneOfCalliesWithNewParameter( + Module &M, Function *K, Function *Callie, Instruction *BasePtr, + ValueMap &OldCallieToNewCallie) { + // Update callies to accept the new parameter which is of type same as + // `BasePtr` by creating their clones. Here is the brief sketch of the + // functionality of this function: + // + // 1. Collect all the call graph paths between the kernel and the callie + // 2. Traverse all the call graph paths from kernel to callie. + // 3. For each device function encoutered while traversing, create a clone of + // it, by adding a new parameter of type BasePtr`s type to it's parameter + // list, but also retain the original device function for a moment. + SmallVector, 8> CGPaths; + + CollectCallGraphPathsBetweenKernelAndCallie(M, K, Callie, CGPaths); + + for (auto CGPath : CGPaths) { + // TODO: We can in-fact assert that length of `CGPath` is atleast two. But, + // we are okay for now. + if (CGPath.size() < 2) + continue; + + // We are interested in cloning of only device functions in the call graph + // path, hence we start from second node in the call graph path. + auto it = CGPath.begin() + 1; + for (; it != CGPath.end(); ++it) { + Function *CurCallie = *it; + + if (OldCallieToNewCallie.find(CurCallie) != OldCallieToNewCallie.end()) + continue; + + // The `CurCallie` is encountered first time for the LDS in question, + // Create a clone of it, and save it. + Function *NewCallie = cloneCallie(M, BasePtr->getType(), CurCallie); + OldCallieToNewCallie[CurCallie] = NewCallie; + } + } +} + +static Instruction *insertBasePointerAccessInstructionsWithinKernel( + Module &M, Function *K, GlobalVariable *LDS, GlobalVariable *NewLDS, + uint64_t Offset) { + // Insert instructions as below at the begining of the entry basic block of + // the kernel + // 1. Insert GEP instruction which access the address `NewLDS + Offset`, say, + // result is `GEPInst` which is of type `char*`. + // 2. Insert type cast instruction which type casts `GEPInst` from `char*` to + // `basetype*` where `basetype` is base type of `LDS`, say the result is, + // `CastInst`. + // 3. Return `CastInst`. + + // Suffix the names of the instructions with unique integer values + static int Suffix = 0; + ++Suffix; + + // Insert gep instruction + BasicBlock::const_iterator iit = K->getEntryBlock().getFirstInsertionPt(); +#ifndef NDEBUG + assert(iit != K->getEntryBlock().end() && + "Entry basic block of the kernel cannot be empty, otherwise control " + "would not reach this point\n"); +#endif + const Instruction &EI = *iit; + Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(M.getContext())), + Constant::getIntegerValue( + Type::getInt64Ty(M.getContext()), APInt(64, Offset))}; + Instruction *GEPInst = GetElementPtrInst::CreateInBounds( + NewLDS->getValueType(), const_cast(NewLDS), Indices, + Twine("dssv.gep.") + Twine(Suffix), const_cast(&EI)); + + // Insert type-cast instruction + // TODO: Do we need to handle any other aggregate type apart from array type? + // and, what about 2 and other higher dimensional arrays? + Type *LDSValueType = LDS->getValueType(); + if (ArrayType *AT = dyn_cast(LDSValueType)) + LDSValueType = AT->getElementType(); + PointerType *ToBeCastedType = + PointerType::get(LDSValueType, AMDGPUAS::LOCAL_ADDRESS); + Instruction *CastInst = new BitCastInst(GEPInst, ToBeCastedType, + Twine("dssv.cast.") + Twine(Suffix), + const_cast(&EI)); + + // Return type-casted instruction + return CastInst; +} + +static GlobalVariable * +createSingleContiguousLayout(Module &M, Function *K, + uint64_t &TotalLDSSizeInBytes) { + // Insert a new LDS global which is nothing but a single contigeous shared + // memory layout representing all the LDS globals associted with the kernel + // which includes those directly defined within the kernel and those + // indirectly defined within callies. + // + // The size of this new contigeous LDS global layout is equal to the sum of + // the sizes of all the associated LDS globals. + // TODO: what about the name of this new LDS global? is it fine or need to be + // changed? + Type *NewLDSTy = + ArrayType::get(IntegerType::get(M.getContext(), 8), TotalLDSSizeInBytes); + GlobalVariable *NewLDS = new GlobalVariable( + M, NewLDSTy, false, GlobalValue::InternalLinkage, + UndefValue::get(NewLDSTy), Twine(K->getName()) + Twine("_LDSLayout"), + nullptr, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); + NewLDS->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + NewLDS->setAlignment(MaybeAlign(M.getDataLayout().getPreferredAlign(NewLDS))); + + return NewLDS; +} + +static void +computeTotalLDSSizeInBytes(ValueMap &LDSToSize, + SetVector &DirectLDSList, + SetVector &IndirectLDSList, + ValueMap &LDSToOffset, + uint64_t &TotalLDSSizeInBytes) { + // Compute the total size of all LDS globals, and also offsets associated with + // them within in the new LDS global. + TotalLDSSizeInBytes = 0; + for (GlobalVariable *LDS : DirectLDSList) { + LDSToOffset[LDS] = TotalLDSSizeInBytes; + TotalLDSSizeInBytes += LDSToSize[LDS]; + } + for (GlobalVariable *LDS : IndirectLDSList) { + LDSToOffset[LDS] = TotalLDSSizeInBytes; + TotalLDSSizeInBytes += LDSToSize[LDS]; + } +} + +static bool handleDeviceScopeSharedVariables( + Module &M, SetVector &Kernels, + ValueMap &LDSToFunction, + ValueMap> &KernelToDirectLDS, + ValueMap> &KernelToIndirectLDS, + ValueMap &LDSToSize) { + // Handle LDS globals associated with each kernel + bool Change = false; + SmallPtrSet ProcessedLDSList; + std::map> + KernelToDirectBasePtrInst; + std::map> + KernelToIndirectBasePtrInst; + + // For each LDS global, insert base pointer access instruction within + // associated kernel(s). + for (Function *K : Kernels) { + // Copy both direct and indirect LDS list for current kernel. + SetVector DirectLDSList; + SetVector IndirectLDSList; + if (KernelToDirectLDS.find(K) != KernelToDirectLDS.end()) + DirectLDSList = KernelToDirectLDS[K]; + if (KernelToIndirectLDS.find(K) != KernelToIndirectLDS.end()) + IndirectLDSList = KernelToIndirectLDS[K]; + + // No LDS globals to process? ignore the kernel, goto next kernel. + if (DirectLDSList.empty() && IndirectLDSList.empty()) + continue; + + // We are going process LDS globals atleast for one kernel. + Change = true; + + // Create a single contigeous LDS latout for current kernel + uint64_t TotalLDSSizeInBytes; + ValueMap LDSToOffset; + computeTotalLDSSizeInBytes(LDSToSize, DirectLDSList, IndirectLDSList, + LDSToOffset, TotalLDSSizeInBytes); + GlobalVariable *NewLDS = + createSingleContiguousLayout(M, K, TotalLDSSizeInBytes); + + // For each LDS global (both direct and indirect ones), insert base pointer + // access instruction within kernel. + std::map DirectLDSToBasePtrInst; + std::map IndirectLDSToBasePtrInst; + for (GlobalVariable *LDS : DirectLDSList) { + Instruction *BasePtr = insertBasePointerAccessInstructionsWithinKernel( + M, K, LDS, NewLDS, LDSToOffset[LDS]); + DirectLDSToBasePtrInst[LDS] = BasePtr; + } + for (GlobalVariable *LDS : IndirectLDSList) { + Instruction *BasePtr = insertBasePointerAccessInstructionsWithinKernel( + M, K, LDS, NewLDS, LDSToOffset[LDS]); + IndirectLDSToBasePtrInst[LDS] = BasePtr; + } + + KernelToDirectBasePtrInst[K] = DirectLDSToBasePtrInst; + KernelToIndirectBasePtrInst[K] = IndirectLDSToBasePtrInst; + } + + // None of the kernel has any LDS globals (direct and/or indirect ones) + // associated with them. Nothing to do, no changes being made to module. + if (!Change) + return Change; + + // Handle indirect LDS globals + for (auto it = KernelToIndirectBasePtrInst.begin(); + it != KernelToIndirectBasePtrInst.end(); ++it) { + Function *K = it->first; + std::map IndirectLDSToBasePtrInst = + it->second; + + // Create clone of all callies (device functions) with new parameters. + for (auto bit = IndirectLDSToBasePtrInst.begin(); + bit != IndirectLDSToBasePtrInst.end(); ++bit) { + GlobalVariable *LDS = bit->first; + Instruction *BasePtr = bit->second; + ValueMap OldCallieToNewCallie; + + if (ProcessedLDSList.find(LDS) != ProcessedLDSList.end()) + continue; + + createCloneOfCalliesWithNewParameter(M, K, LDSToFunction[LDS], BasePtr, + OldCallieToNewCallie); + + updateLDSToFunctionMap(LDSToFunction, OldCallieToNewCallie); + + // TODO: + // 1. update call sites of all device functions to have new argument + // while updating within kernels, make sure that you use base pointer + // associated with that particular (kernel, lds) pair. + // 2. replace uses of old callies at call sites with new callies + // 3. erase old callies from module + + ProcessedLDSList.insert(LDS); + } + } + + // Handle direct LDS globals + for (auto it = KernelToDirectBasePtrInst.begin(); + it != KernelToDirectBasePtrInst.end(); ++it) { + Function *K = it->first; + std::map DirectLDSToBasePtrInst = + it->second; + + for (auto cit = DirectLDSToBasePtrInst.begin(); + cit != DirectLDSToBasePtrInst.end(); ++cit) { + GlobalVariable *LDS = cit->first; + Instruction *BasePtr = cit->second; + replaceUseOfDirectLDSGlobalWithinKernel(M, K, LDS, BasePtr); + } + } + + return Change; +} + +static bool +handleDeviceScopeSharedVariables(Module &M, + SetVector &LDSGlobals, + SetVector &Kernels) { + // Pair up each LDS global with the enclosing function where the LDS global is + // defined + ValueMap LDSToFunction; + for (GlobalVariable *LDSGlobal : LDSGlobals) + pairUpLDSGlobalWithEnclosingFunction(LDSGlobal, LDSToFunction); + + // Create reverse map from enclosing function to LDS global list + ValueMap> FunctionToLDS; + createFunctionToLDSMap(LDSToFunction, FunctionToLDS); + + // Pair up kernels with callie list which define LDS globals + ValueMap> KernelToCallie; + for (Function *K : Kernels) + pairUpKernelWithCallieList(M, K, FunctionToLDS, KernelToCallie); + + // Pair up kernels with all the LDS globals: both direct LDS globals (those + // directly defined within the kernels), and indirect LDS globals (those + // indirectly defined within the callies). + ValueMap> KernelToDirectLDS; + ValueMap> KernelToIndirectLDS; + for (Function *K : Kernels) + pairUpKernelWithLDSList(K, KernelToCallie, FunctionToLDS, KernelToDirectLDS, + KernelToIndirectLDS); + + // Get the size of each LDS global in bytes + ValueMap LDSToSize; + for (GlobalVariable *LDSGlobal : LDSGlobals) + getLDSGlobalSizeInBytes(M, LDSGlobal, LDSToSize); + + return handleDeviceScopeSharedVariables(M, Kernels, LDSToFunction, + KernelToDirectLDS, + KernelToIndirectLDS, LDSToSize); +} + +static bool handleDeviceScopeSharedVariables(Module &M) { + // Collect all the (static) LDS globals defined within the current module + SetVector LDSGlobals; + for (GlobalVariable &GV : M.globals()) + if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && + GV.hasInternalLinkage()) + LDSGlobals.insert(&GV); + + if (LDSGlobals.empty()) { + LLVM_DEBUG(dbgs() << "No LDS globals defined in the module " << M.getName() + << ", skipping handling device of scope shared variables" + << "\n"); + return false; + } + + // Collect all the amdgpu kernels defined within the current module + SetVector Kernels; + for (Function &F : M.functions()) { + if ((F.getCallingConv() == CallingConv::AMDGPU_KERNEL) && + !F.isDeclaration()) + Kernels.insert(&F); + } + + if (Kernels.empty()) { + LLVM_DEBUG(dbgs() << "No kernels defined in the module " << M.getName() + << ", skipping handling of device scope shared variables" + << "\n"); + return false; + } + + return handleDeviceScopeSharedVariables(M, LDSGlobals, Kernels); +} + +bool AMDGPUDeviceScopeSharedVariable::runOnModule(Module &M) { + LLVM_DEBUG(dbgs() << "===== Handling device scope shared variables in the " + "module " + << M.getName() << "\n"); + + // TODO: We only want to handle HIP kernels, and no kernels from from other + // programming languages, like OpenCL, OpenMP, etc. Do we need to add a + // condition here for it, and skip running the pass for non-HIP kernels? + if (skipModule(M)) { + LLVM_DEBUG(dbgs() << "Skipping handling of device scope shared variables " + "in the module " + << M.getName() << "\n"); + return false; + } + + bool Changed = handleDeviceScopeSharedVariables(M); + + LLVM_DEBUG(dbgs() << "===== Done with hanlding device scope shared variables " + "in the module " + << M.getName() << "\n"); + + return Changed; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -40,6 +40,7 @@ static bool EnableLateStructurizeCFG; static bool EnableFunctionCalls; static bool EnableFixedFunctionABI; + static bool EnableDeviceScopeSharedVariable; AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -198,6 +198,12 @@ cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), cl::Hidden); +static cl::opt EnableDeviceScopeSharedVariable( + "amdgpu-enable-device-scope-shared-variable", + cl::desc("Support amdgpu device scope shared variables"), + cl::location(AMDGPUTargetMachine::EnableDeviceScopeSharedVariable), + cl::init(false), cl::Hidden); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine X(getTheAMDGPUTarget()); @@ -265,6 +271,7 @@ initializeGCNRegBankReassignPass(*PR); initializeGCNNSAReassignPass(*PR); initializeSIAddIMGInitPass(*PR); + initializeAMDGPUDeviceScopeSharedVariablePass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -394,6 +401,7 @@ bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; bool AMDGPUTargetMachine::EnableFunctionCalls = false; bool AMDGPUTargetMachine::EnableFixedFunctionABI = false; +bool AMDGPUTargetMachine::EnableDeviceScopeSharedVariable = false; AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; @@ -771,6 +779,11 @@ // but EarlyCSE can do neither of them. if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses) addEarlyCSEOrGVNPass(); + + // We expect to run this pass as a last IR pass. Hence make sure that this + // pass is added as a last IR pass + if (EnableDeviceScopeSharedVariable) + addPass(createAMDGPUDeviceScopeSharedVariablePass()); } void AMDGPUPassConfig::addCodeGenPrepare() { diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -48,6 +48,7 @@ AMDGPUAtomicOptimizer.cpp AMDGPUCallLowering.cpp AMDGPUCodeGenPrepare.cpp + AMDGPUDeviceScopeSharedVariable.cpp AMDGPUExportClustering.cpp AMDGPUFixFunctionBitcasts.cpp AMDGPUFrameLowering.cpp