diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -271,6 +271,10 @@
 void initializeGCNNSAReassignPass(PassRegistry &);
 extern char &GCNNSAReassignID;
 
+ModulePass *createAMDGPUDeviceScopeSharedVariablePass();
+void initializeAMDGPUDeviceScopeSharedVariablePass(PassRegistry &);
+extern char &AMDGPUDeviceScopeSharedVariableID;
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -123,13 +123,15 @@
   // should only appear when IPO passes manage to move LDS globals defined in
   // a kernel into a single user function.
 
-  for (GlobalVariable &GV : M.globals()) {
-    // TODO: Region address
-    unsigned AS = GV.getAddressSpace();
-    if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
-      continue;
-
-    recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+  if (!AMDGPUTargetMachine::EnableDeviceScopeSharedVariable) {
+    for (GlobalVariable &GV : M.globals()) {
+      // TODO: Region address
+      unsigned AS = GV.getAddressSpace();
+      if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
+        continue;
+
+      recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+    }
   }
 
   if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp b/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp
@@ -0,0 +1,742 @@
+//===-- AMDGPUDeviceScopeSharedVariable.cpp -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements device scope shared variables: for each kernel, all
+// the LDS globals used by the kernel, either directly within the kernel or
+// indirectly within its callees, are packed into a single per-kernel LDS
+// layout. At kernel entry, a base pointer into that layout is computed for
+// every original LDS global; direct uses within the kernel are rewritten in
+// terms of the base pointer, and LDS-using callees are cloned to take the
+// base pointer as an extra trailing parameter.
+//
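+// For illustration only, a hypothetical HIP input this pass is meant to
+// handle (all names are made up):
+//
+//   __device__ void bar(int i, int v) {
+//     __shared__ int SharedB[64]; // indirect LDS for kernel foo
+//     SharedB[i] = v;
+//   }
+//
+//   __global__ void foo(int *out, int i) {
+//     __shared__ int SharedA[64]; // direct LDS of kernel foo
+//     SharedA[i] = i;
+//     bar(i, SharedA[i]);
+//     out[i] = SharedA[i];
+//   }
+//
+// Conceptually, SharedA and SharedB would be packed into a single 512-byte
+// layout @foo_LDSLayout, and bar would be cloned with a trailing parameter
+// carrying SharedB's base pointer.
+//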
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <utility>
+
+#define DEBUG_TYPE "amdgpu-device-scope-shared-variable"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUDeviceScopeSharedVariable : public ModulePass {
+public:
+  static char ID;
+
+  AMDGPUDeviceScopeSharedVariable() : ModulePass(ID) {
+    initializeAMDGPUDeviceScopeSharedVariablePass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override;
+};
+
+} // namespace
+
+char AMDGPUDeviceScopeSharedVariable::ID = 0;
+
+char &llvm::AMDGPUDeviceScopeSharedVariableID =
+    AMDGPUDeviceScopeSharedVariable::ID;
+
+ModulePass *llvm::createAMDGPUDeviceScopeSharedVariablePass() {
+  return new AMDGPUDeviceScopeSharedVariable();
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPUDeviceScopeSharedVariable,
+                      "implement-amdgpu-device-scope-shared-variable",
+                      "Implement AMDGPU Device Scope Shared Variable",
+                      false /*only look at the cfg*/, false /*analysis pass*/)
+INITIALIZE_PASS_DEPENDENCY(AMDGPUAlwaysInline)
+INITIALIZE_PASS_DEPENDENCY(SimpleInliner)
+INITIALIZE_PASS_END(AMDGPUDeviceScopeSharedVariable,
+                    "implement-amdgpu-device-scope-shared-variable",
+                    "Implement AMDGPU Device Scope Shared Variable",
+                    false /*only look at the cfg*/, false /*analysis pass*/)
+
+static void createFunctionToLDSMap(
+    ValueMap<GlobalVariable *, Function *> &LDSToFunction,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &FunctionToLDS) {
+  for (auto it = LDSToFunction.begin(); it != LDSToFunction.end(); ++it) {
+    GlobalVariable *LDSGlobal = it->first;
+    Function *EnclosingFunction = it->second;
+    auto rit = FunctionToLDS.find(EnclosingFunction);
+    if (rit == FunctionToLDS.end()) {
+      SetVector<GlobalVariable *> LDSSet;
+      LDSSet.insert(LDSGlobal);
+      FunctionToLDS[EnclosingFunction] = LDSSet;
+    } else
+      FunctionToLDS[EnclosingFunction].insert(LDSGlobal);
+  }
+}
+
+static void pairUpKernelWithLDSList(
+    Function *K, ValueMap<Function *, SetVector<Function *>> &KernelToCallee,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &FunctionToLDS,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &KernelToDirectLDS,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &KernelToIndirectLDS) {
+  // If direct LDS globals exist within the kernel, collect them.
+  if (FunctionToLDS.find(K) != FunctionToLDS.end())
+    KernelToDirectLDS[K] = FunctionToLDS[K];
+
+  // Collect all the indirect LDS globals defined within the callee(s) of the
+  // kernel.
+  SetVector<GlobalVariable *> IndirectLDSSet;
+  SetVector<Function *> Callees = KernelToCallee[K];
+  for (Function *Callee : Callees) {
+    if (FunctionToLDS.find(Callee) == FunctionToLDS.end())
+      continue;
+    SetVector<GlobalVariable *> CalleeLDSList = FunctionToLDS[Callee];
+    for (GlobalVariable *CalleeLDS : CalleeLDSList)
+      IndirectLDSSet.insert(CalleeLDS);
+  }
+  KernelToIndirectLDS[K] = IndirectLDSSet;
+}
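+// Tying the maps together, for the hypothetical input from the file header
+// comment (SharedA defined in kernel foo, SharedB defined in device function
+// bar which foo calls), the pairing above would produce roughly:
+//
+//   KernelToDirectLDS[foo]   = { SharedA }
+//   KernelToIndirectLDS[foo] = { SharedB }
+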
+static void pairUpKernelWithCalleeList(
+    Module &M, Function *K,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &FunctionToLDS,
+    ValueMap<Function *, SetVector<Function *>> &KernelToCallee) {
+  // Get the call graph node associated with the current kernel, traverse the
+  // call graph from it in DFS manner, and collect all the associated callees
+  // which define LDS global(s).
+  CallGraph CG = CallGraph(M);
+  CallGraphNode *KernCGNode = CG[K];
+  SmallVector<CallGraphNode *, 8> CGNodeStack;
+  SetVector<CallGraphNode *> Visited;
+
+#ifndef NDEBUG
+  assert(KernCGNode && "Call graph node associated with kernel definition "
+                       "cannot be null\n");
+#endif
+
+  for (auto it = KernCGNode->begin(); it != KernCGNode->end(); ++it) {
+    CallGraphNode *CGN = it->second;
+#ifndef NDEBUG
+    assert(CGN && "Call graph node associated with function definition cannot"
+                  " be null\n");
+#endif
+    CGNodeStack.push_back(CGN);
+  }
+
+  SetVector<Function *> CalleeSet;
+  while (!CGNodeStack.empty()) {
+    CallGraphNode *CGNode = CGNodeStack.pop_back_val();
+    if (!Visited.insert(CGNode))
+      continue;
+
+    Function *F = CGNode->getFunction();
+    if (!F || F->isDeclaration()) {
+#ifndef NDEBUG
+      assert(CGNode->empty() && "Call graph node associated with function "
+                                "declaration should not have a callee list\n");
+#endif
+      continue;
+    }
+
+    auto fit = FunctionToLDS.find(F);
+    if (fit != FunctionToLDS.end())
+      CalleeSet.insert(F);
+
+    for (auto it = CGNode->begin(); it != CGNode->end(); ++it) {
+      CallGraphNode *CGN = it->second;
+#ifndef NDEBUG
+      assert(CGN && "Call graph node associated with function definition "
+                    "cannot be null\n");
+#endif
+      CGNodeStack.push_back(CGN);
+    }
+  }
+
+  KernelToCallee[K] = CalleeSet;
+}
+
+static void pairUpLDSGlobalWithEnclosingFunction(
+    GlobalVariable *LDSGlobal,
+    ValueMap<GlobalVariable *, Function *> &LDSToFunction) {
+  // Recursively visit the user list of the current LDS global, and find the
+  // enclosing function where the LDS global is defined; the enclosing
+  // function should always be successfully found.
+  //
+  // TODO: Is there any other efficient way to find the enclosing functions of
+  // LDS globals?
+#ifndef NDEBUG
+  assert(!LDSGlobal->user_empty() &&
+         "LDS Global user list cannot be empty since it must have been defined "
+         "within either kernel or device function");
+#endif
+  SmallVector<User *, 8> UserStack;
+  SetVector<User *> Visited;
+
+  for (User *U : LDSGlobal->users())
+    UserStack.push_back(U);
+
+  while (!UserStack.empty()) {
+    User *U = UserStack.pop_back_val();
+    if (!Visited.insert(U))
+      continue;
+
+    if (Instruction *I = dyn_cast<Instruction>(U)) {
+      Function *F = I->getParent()->getParent();
+      if (F) {
+        LDSToFunction[LDSGlobal] = F;
+        return;
+      }
+      continue;
+    }
+
+    for (User *UU : U->users())
+      UserStack.push_back(UU);
+  }
+#ifndef NDEBUG
+  assert(false && "Control is not expected to reach this point");
+#endif
+}
+
+static void
+getLDSGlobalSizeInBytes(Module &M, GlobalVariable *LDSGlobal,
+                        ValueMap<GlobalVariable *, uint64_t> &LDSToSize) {
+  Type *Ty = LDSGlobal->getValueType();
+  const DataLayout &DL = M.getDataLayout();
+  uint64_t SizeInBytes = DL.getTypeSizeInBits(Ty).getFixedSize() / 8;
+  LDSToSize[LDSGlobal] = SizeInBytes;
+}
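+// A sketch of the rewrite performed below for direct uses (illustrative
+// only; value names are made up). Given a use of an LDS global within the
+// kernel such as:
+//
+//   %gep = getelementptr ... @SharedA, i32 0, i32 %i
+//
+// and a base pointer %dssv.cast.N computed at kernel entry for SharedA's
+// slot in the packed layout, the GEP is re-created against the base pointer:
+//
+//   %dssv.cast.N.ptr.arith.N = getelementptr ... %dssv.cast.N, i32 %i
+//
+// Loads and stores through the LDS global are likewise re-created against
+// the base pointer.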
+static void handleDirectLDSGlobalWithinKernel(Module &M, Function *K,
+                                              GlobalVariable *LDS,
+                                              Instruction *BasePtr) {
+  // Suffix the names of the instructions with unique integer values.
+  static int Suffix = 0;
+  ++Suffix;
+
+  // Take a snapshot of the users of `LDS` first, since the loop below erases
+  // user instructions and would otherwise invalidate the use list iterator.
+  SmallVector<User *, 8> Users(LDS->user_begin(), LDS->user_end());
+
+  // Traverse each `use` of `LDS`, create a proper `ToBeReplacedInst` for it,
+  // and accordingly replace it.
+  for (User *U : Users) {
+    Instruction *UserInst = dyn_cast<Instruction>(U);
+    if (!UserInst)
+      continue;
+
+    Instruction *ToBeReplacedInst = nullptr;
+
+    if (GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(UserInst)) {
+      // User instruction is a GEP instruction; replace it as below:
+      // 1. Extract the last operand of `GEPInst`, say `Offset`.
+      // 2. Create the pointer arithmetic instruction `BasePtr + Offset`.
+#ifndef NDEBUG
+      assert(GEPInst->hasIndices() && "Expected one or more GEP indices\n");
+#endif
+      Value *Offset = GEPInst->getOperand(GEPInst->getNumIndices());
+      ToBeReplacedInst = GetElementPtrInst::CreateInBounds(
+          GEPInst->getResultElementType(), BasePtr, Offset,
+          Twine(BasePtr->getName()) + Twine(".ptr.arith.") + Twine(Suffix),
+          UserInst);
+    } else if (LoadInst *LInst = dyn_cast<LoadInst>(UserInst)) {
+      // User instruction is a LOAD instruction; replace the pointer operand
+      // of the LOAD instruction by `BasePtr`.
+      ToBeReplacedInst = new LoadInst(LInst->getType(), BasePtr,
+                                      Twine(BasePtr->getName()) +
+                                          Twine(".load.") + Twine(Suffix),
+                                      UserInst);
+    } else if (StoreInst *SInst = dyn_cast<StoreInst>(UserInst)) {
+      // User instruction is a STORE instruction; replace the pointer operand
+      // of the STORE instruction by `BasePtr`.
+      ToBeReplacedInst =
+          new StoreInst(SInst->getValueOperand(), BasePtr, UserInst);
+    } else {
+      // TODO: Do we need to specially handle any other kind of instructions
+      // apart from GEP, LOAD, and STORE?
+#ifndef NDEBUG
+      assert(false && "Not implemented\n");
+#endif
+    }
+
+    // Replace `UserInst` by `ToBeReplacedInst` and erase `UserInst`.
+#ifndef NDEBUG
+    assert(ToBeReplacedInst && "To be replaced instruction cannot be null\n");
+#endif
+    ToBeReplacedInst->copyMetadata(*UserInst);
+    UserInst->replaceAllUsesWith(ToBeReplacedInst);
+    UserInst->eraseFromParent();
+  }
+}
+
+static void
+getClonedArgumentList(Function *K, Function *CurCaller, Instruction *BasePtr,
+                      CallInst *CI,
+                      ValueMap<Function *, Function *> &OldCalleeToNewCallee,
+                      SmallVectorImpl<Value *> &NewArgs) {
+  for (auto it = CI->arg_begin(); it != CI->arg_end(); ++it)
+    NewArgs.push_back(*it);
+
+  Value *NewArg = nullptr;
+  if (CurCaller != K) {
+    Function *NewCurCaller = OldCalleeToNewCallee[CurCaller];
+#ifndef NDEBUG
+    assert(NewCurCaller && "Proper new caller should exist\n");
+#endif
+
+    NewArg = NewCurCaller->getArg(NewCurCaller->arg_size() - 1);
+#ifndef NDEBUG
+    assert(NewArg && "Proper new parameter within new caller should exist\n");
+#endif
+  } else
+    NewArg = BasePtr;
+
+  NewArgs.push_back(NewArg);
+}
+
+static CallInst *
+getCallInstruction(Optional<WeakTrackingVH> &O, Function *K,
+                   Function *CurCaller,
+                   ValueMap<Function *, Function *> &OldCalleeToNewCallee) {
+#ifndef NDEBUG
+  assert(O.hasValue() && "Valid call instruction should exist\n");
+#endif
+
+  CallInst *CI = dyn_cast<CallInst>(O.getValue());
+#ifndef NDEBUG
+  assert(CI && "Valid call instruction should exist\n");
+  assert(CI->getParent()->getParent() == CurCaller && "Not a valid caller\n");
+#endif
+
+  // At this point, `CI` is a call instruction from `CurCaller`. If the
+  // `CurCaller` is kernel `K` itself, then return the same call instruction.
+  // Otherwise, we need to find the replica of `CI` within the new clone of
+  // `CurCaller`, and return it.
+  if (CurCaller == K)
+    return CI;
+
+  // Find the replica of `CI` within the new clone of `CurCaller`, and return
+  // it.
+  // TODO: Yet to implement it.
+  // Function *NewCaller = OldCalleeToNewCallee[CurCaller];
+  return nullptr;
+}
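+// An illustrative sketch of what updateCurrentCaller below produces
+// (hypothetical names): given an original call in the kernel
+//
+//   call void @bar(i32 %i, i32 %v)
+//
+// a new call to the clone is inserted just before it, passing the base
+// pointer as a trailing argument:
+//
+//   call void @bar.clone(i32 %i, i32 %v, i8 addrspace(3)* %base)
+//
+// The original call is expected to become dead once all the callees are
+// cloned, and is left for later cleanup.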
+static void
+updateCurrentCaller(Module &M, Function *K, Instruction *BasePtr,
+                    CallGraphNode *CurCallerCGNode, Function *CurCallee,
+                    ValueMap<Function *, Function *> &OldCalleeToNewCallee) {
+  // Update the current caller to have new call instruction(s) to the new
+  // callee.
+  Function *CurCaller = CurCallerCGNode->getFunction();
+  for (auto it = CurCallerCGNode->begin(); it != CurCallerCGNode->end();
+       ++it) {
+    Function *Callee = it->second->getFunction();
+
+    // Not the callee in question? Ignore it.
+    if (Callee != CurCallee)
+      continue;
+
+    // We have found a call site which has a call to `CurCallee`. Get the call
+    // instruction within the current caller which calls the current callee.
+    // Note: If the caller is a device function, then we should get the call
+    // instruction from the new clone of it, `not` from the original caller.
+    CallInst *CI =
+        getCallInstruction(it->first, K, CurCaller, OldCalleeToNewCallee);
+
+    // Get the new argument list which is required to insert the new call
+    // instruction.
+    SmallVector<Value *, 8> NewArgs;
+    getClonedArgumentList(K, CurCaller, BasePtr, CI, OldCalleeToNewCallee,
+                          NewArgs);
+
+    // Insert the new call instruction `NewCI` just before the existing call
+    // instruction `CI`.
+    Function *NewCallee = OldCalleeToNewCallee[CurCallee];
+#ifndef NDEBUG
+    assert(NewCallee && "Valid new callee should exist\n");
+#endif
+    CallInst *NewCI = CallInst::Create(NewCallee->getFunctionType(), NewCallee,
+                                       NewArgs, Twine("hsm-call"), CI);
+    NewCI->copyMetadata(*CI);
+  }
+}
+
+static Function *cloneFunction(Function *F, Type *BasePtrType) {
+  // Create a new function type by appending the `BasePtr` argument type to
+  // the existing argument list.
+  SmallVector<Type *, 8> NewParams;
+  FunctionType *FnTy = F->getFunctionType();
+  for (auto it = FnTy->param_begin(); it != FnTy->param_end(); ++it)
+    NewParams.push_back(*it);
+  NewParams.push_back(BasePtrType);
+  FunctionType *NewFnTy =
+      FunctionType::get(FnTy->getReturnType(), NewParams, FnTy->isVarArg());
+
+  // Create a copy of the current function with the new function type.
+  Function *NewF = Function::Create(NewFnTy, F->getLinkage(),
+                                    F->getAddressSpace(), F->getName());
+  ValueToValueMapTy VMap;
+  auto *NewFArgIt = NewF->arg_begin();
+  for (auto &Arg : F->args()) {
+    auto ArgName = Arg.getName();
+    NewFArgIt->setName(ArgName);
+    VMap[&Arg] = &(*NewFArgIt++);
+  }
+  // TODO: Should ModuleLevelChanges be set to true or false?
+  SmallVector<ReturnInst *, 8> Returns;
+  CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns);
+
+  // Copy all metadata.
+  SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+  F->getAllMetadata(MDs);
+  for (auto MDIt : MDs)
+    NewF->addMetadata(MDIt.first, *MDIt.second);
+
+  return NewF;
+}
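+// For illustration (hypothetical types): cloning a callee of type
+// `void (i32, i32)` for a base pointer of type `i8 addrspace(3)*` yields a
+// clone of type `void (i32, i32, i8 addrspace(3)*)`; the trailing parameter
+// carries the callee's LDS base pointer from the kernel down the call chain.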
+static void createCloneOfCalleesWithNewParameter(
+    Module &M, Function *K, Function *Callee, Instruction *BasePtr,
+    SmallVectorImpl<SetVector<CallGraphNode *>> &CGPaths,
+    ValueMap<Function *, Function *> &OldCalleeToNewCallee) {
+  for (auto CGPath : CGPaths) {
+    // TODO: We can in fact assert that the length of `CGPath` is at least
+    // two. But we are okay for now.
+    if (CGPath.size() < 2)
+      continue;
+
+    // The first function in the call graph path is always the kernel.
+    CallGraphNode *CurCallerCGNode = *CGPath.begin();
+#ifndef NDEBUG
+    assert(CurCallerCGNode->getFunction() == K &&
+           "Should be kernel function\n");
+#endif
+
+    // We are interested in cloning only the device functions in the call
+    // graph path, hence we start from the second node in the path.
+    auto it = CGPath.begin() + 1;
+    for (; it != CGPath.end(); ++it) {
+      CallGraphNode *CurCalleeCGNode = *it;
+      Function *CurCallee = CurCalleeCGNode->getFunction();
+
+      // If `CurCallee` is encountered for the first time, create a clone of
+      // it, as below:
+      // 1. Clone `CurCallee` as `NewCallee`. This new clone has a new
+      //    trailing parameter of the same type as the `BasePtr` type.
+      // 2. Insert `NewCallee` into the module just before `CurCallee`, but
+      //    keep `CurCallee` until it is all set to be removed from the
+      //    module.
+      if (OldCalleeToNewCallee.find(CurCallee) == OldCalleeToNewCallee.end()) {
+        Function *NewCallee = cloneFunction(CurCallee, BasePtr->getType());
+        M.getFunctionList().insert(CurCallee->getIterator(), NewCallee);
+        OldCalleeToNewCallee[CurCallee] = NewCallee;
+      }
+
+      // Update the current caller to have a new call instruction to the new
+      // callee.
+      updateCurrentCaller(M, K, BasePtr, CurCallerCGNode, CurCallee,
+                          OldCalleeToNewCallee);
+
+      // The current callee becomes the next caller in the call graph path.
+      CurCallerCGNode = CurCalleeCGNode;
+    }
+  }
+}
+
+static void collectCallGraphPathsBetweenKernelAndCallee(
+    Module &M, Function *K, Function *Callee,
+    SmallVectorImpl<SetVector<CallGraphNode *>> &CGPaths) {
+  // Traverse the call graph associated with the kernel in DFS manner and
+  // collect all the paths from the kernel to the callee.
+  //
+  // TODO: Note that this algorithm will not work if there exist recursive
+  // calls; the current assumption here is that the call graph is acyclic. We
+  // need to revisit it to handle call graphs which could contain cycles.
+  CallGraph CG = CallGraph(M);
+  CallGraphNode *KernCGNode = CG[K];
+#ifndef NDEBUG
+  assert(KernCGNode && "Call graph node associated with kernel definition "
+                       "cannot be null\n");
+#endif
+
+  SmallVector<SetVector<CallGraphNode *>, 16> Stack;
+  SetVector<CallGraphNode *> Path;
+  Path.insert(KernCGNode);
+  Stack.push_back(Path);
+
+  while (!Stack.empty()) {
+    SetVector<CallGraphNode *> TopPath = Stack.pop_back_val();
+    CallGraphNode *CGNode = TopPath.back();
+    Function *F = CGNode->getFunction();
+    if (F == Callee) {
+      CGPaths.push_back(TopPath);
+      continue;
+    }
+
+    for (auto it = CGNode->begin(); it != CGNode->end(); ++it) {
+      CallGraphNode *CGN = it->second;
+#ifndef NDEBUG
+      assert(CGN && "Call graph node associated with function definition "
+                    "cannot be null\n");
+#endif
+      SetVector<CallGraphNode *> ClonedPath(TopPath.begin(), TopPath.end());
+      ClonedPath.insert(CGN);
+      Stack.push_back(ClonedPath);
+    }
+  }
+}
+
+static void handleIndirectLDSGlobalWithinCallee(
+    Module &M, Function *K, Function *Callee, Instruction *BasePtr,
+    ValueMap<Function *, Function *> &OldCalleeToNewCallee) {
+  // Collect all the call graph paths between the kernel and the callee.
+  SmallVector<SetVector<CallGraphNode *>, 8> CGPaths;
+  collectCallGraphPathsBetweenKernelAndCallee(M, K, Callee, CGPaths);
+
+  // Update the callees to accept the new parameter, whose type matches the
+  // `BasePtr` type, by creating clones of them.
+  createCloneOfCalleesWithNewParameter(M, K, Callee, BasePtr, CGPaths,
+                                       OldCalleeToNewCallee);
+}
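+// A sketch of the kernel-entry code emitted below (illustrative only; the
+// value names follow the `dssv.` scheme used in the code). For an LDS global
+// of type [64 x i32] placed at byte offset 256 of a 512-byte packed layout:
+//
+//   %dssv.gep.N  = getelementptr inbounds [512 x i8],
+//                  [512 x i8] addrspace(3)* @foo_LDSLayout, i32 0, i64 256
+//   %dssv.cast.N = bitcast i8 addrspace(3)* %dssv.gep.N
+//                  to i32 addrspace(3)*
+//
+// %dssv.cast.N then serves as the base pointer for rewriting the uses of
+// that LDS global.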
+static Instruction *insertBasePointerAccessInstructionsWithinKernel(
+    Module &M, Function *K, GlobalVariable *LDS, GlobalVariable *NewLDS,
+    uint64_t Offset) {
+  // Insert instructions as below at the beginning of the entry basic block of
+  // the kernel:
+  // 1. Insert a GEP instruction which accesses the address `NewLDS + Offset`;
+  //    say the result is `GEPInst`, which is of type `char*`.
+  // 2. Insert a type cast instruction which casts `GEPInst` from `char*` to
+  //    `basetype*`, where `basetype` is the base type of `LDS`; say the
+  //    result is `CastInst`.
+  // 3. Return `CastInst`.
+
+  // Suffix the names of the instructions with unique integer values.
+  static int Suffix = 0;
+  ++Suffix;
+
+  // Insert the GEP instruction.
+  BasicBlock::const_iterator iit = K->getEntryBlock().getFirstInsertionPt();
+#ifndef NDEBUG
+  assert(iit != K->getEntryBlock().end() &&
+         "Entry basic block of the kernel cannot be empty, otherwise control "
+         "would not reach this point\n");
+#endif
+  const Instruction &EI = *iit;
+  Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(M.getContext())),
+                      Constant::getIntegerValue(
+                          Type::getInt64Ty(M.getContext()), APInt(64, Offset))};
+  Instruction *GEPInst = GetElementPtrInst::CreateInBounds(
+      NewLDS->getValueType(), const_cast<GlobalVariable *>(NewLDS), Indices,
+      Twine("dssv.gep.") + Twine(Suffix), const_cast<Instruction *>(&EI));
+
+  // Insert the type cast instruction.
+  // TODO: Do we need to handle any other aggregate type apart from array
+  // type? And what about two- and higher-dimensional arrays?
+  Type *LDSValueType = LDS->getValueType();
+  if (ArrayType *AT = dyn_cast<ArrayType>(LDSValueType))
+    LDSValueType = AT->getElementType();
+  PointerType *ToBeCastedType =
+      PointerType::get(LDSValueType, AMDGPUAS::LOCAL_ADDRESS);
+  Instruction *CastInst = new BitCastInst(GEPInst, ToBeCastedType,
+                                          Twine("dssv.cast.") + Twine(Suffix),
+                                          const_cast<Instruction *>(&EI));
+
+  // Return the type-casted instruction.
+  return CastInst;
+}
+
+static bool handleDeviceScopeSharedVariablesForCurKernel(
+    Module &M, Function *K,
+    ValueMap<GlobalVariable *, Function *> &LDSToFunction,
+    ValueMap<GlobalVariable *, uint64_t> &LDSToSize,
+    SetVector<GlobalVariable *> &DirectLDSList,
+    SetVector<GlobalVariable *> &IndirectLDSList,
+    ValueMap<Function *, Function *> &OldCalleeToNewCallee) {
+  // Compute the total size of all the LDS globals, and also the offsets
+  // associated with them within the new LDS global which will be created in a
+  // moment to replace all these LDS globals.
+  uint64_t TotalLDSSizeInBytes = 0;
+  ValueMap<GlobalVariable *, uint64_t> LDSToOffset;
+  for (GlobalVariable *LDS : DirectLDSList) {
+    LDSToOffset[LDS] = TotalLDSSizeInBytes;
+    TotalLDSSizeInBytes += LDSToSize[LDS];
+  }
+  for (GlobalVariable *LDS : IndirectLDSList) {
+    LDSToOffset[LDS] = TotalLDSSizeInBytes;
+    TotalLDSSizeInBytes += LDSToSize[LDS];
+  }
+
+  // Insert a new LDS global which is nothing but a single contiguous shared
+  // memory layout representing all the LDS globals associated with the
+  // kernel, which includes those directly defined within the kernel and those
+  // indirectly defined within its callees.
+  //
+  // The size of this new contiguous LDS global layout is equal to the sum of
+  // the sizes of all the associated LDS globals.
+  // TODO: What about the name of this new LDS global? Is it fine, or does it
+  // need to be changed?
+  Type *NewLDSTy =
+      ArrayType::get(IntegerType::get(M.getContext(), 8), TotalLDSSizeInBytes);
+  GlobalVariable *NewLDS = new GlobalVariable(
+      M, NewLDSTy, false, GlobalValue::InternalLinkage,
+      UndefValue::get(NewLDSTy), Twine(K->getName()) + Twine("_LDSLayout"),
+      nullptr, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
+  NewLDS->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+  NewLDS->setAlignment(MaybeAlign(M.getDataLayout().getPreferredAlign(NewLDS)));
+
+  // Now that we have all the necessary information available, the next step
+  // is to properly replace the original LDS globals by their offset
+  // counterparts.
+  //
+  // First, for each LDS global (both direct and indirect ones), insert the
+  // base pointer access instructions within the kernel.
+  ValueMap<GlobalVariable *, Instruction *> DirectLDSToBasePtrInst;
+  ValueMap<GlobalVariable *, Instruction *> IndirectLDSToBasePtrInst;
+  for (GlobalVariable *LDS : DirectLDSList) {
+    Instruction *BasePtr = insertBasePointerAccessInstructionsWithinKernel(
+        M, K, LDS, NewLDS, LDSToOffset[LDS]);
+    DirectLDSToBasePtrInst[LDS] = BasePtr;
+  }
+  for (GlobalVariable *LDS : IndirectLDSList) {
+    Instruction *BasePtr = insertBasePointerAccessInstructionsWithinKernel(
+        M, K, LDS, NewLDS, LDSToOffset[LDS]);
+    IndirectLDSToBasePtrInst[LDS] = BasePtr;
+  }
+
+  // Next, handle all the indirect globals associated with the current kernel.
+  for (auto it = IndirectLDSToBasePtrInst.begin();
+       it != IndirectLDSToBasePtrInst.end(); ++it) {
+    handleIndirectLDSGlobalWithinCallee(M, K, LDSToFunction[it->first],
+                                        it->second, OldCalleeToNewCallee);
+  }
+
+  // Finally, handle all the direct globals associated with the current
+  // kernel.
+  for (auto it = DirectLDSToBasePtrInst.begin();
+       it != DirectLDSToBasePtrInst.end(); ++it) {
+    handleDirectLDSGlobalWithinKernel(M, K, it->first, it->second);
+  }
+
+  return true;
+}
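+// For illustration (hypothetical sizes): with a direct [64 x i32] SharedA
+// and an indirect [64 x i32] SharedB, the packing above yields
+//
+//   LDSToOffset[SharedA] = 0,   LDSToOffset[SharedB] = 256
+//   @foo_LDSLayout = internal addrspace(3) global [512 x i8] undef
+//
+// i.e. one 512-byte layout replaces both globals for kernel foo.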
+static bool handleDeviceScopeSharedVariables(
+    Module &M, SetVector<Function *> &Kernels,
+    ValueMap<GlobalVariable *, Function *> &LDSToFunction,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &KernelToDirectLDS,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &KernelToIndirectLDS,
+    ValueMap<GlobalVariable *, uint64_t> &LDSToSize) {
+  bool Changed = false;
+
+  // Process the LDS globals associated with each kernel.
+  ValueMap<Function *, Function *> OldCalleeToNewCallee;
+  for (Function *K : Kernels) {
+    SetVector<GlobalVariable *> DirectLDSList;
+    if (KernelToDirectLDS.find(K) != KernelToDirectLDS.end())
+      DirectLDSList = KernelToDirectLDS[K];
+
+    SetVector<GlobalVariable *> IndirectLDSList;
+    if (KernelToIndirectLDS.find(K) != KernelToIndirectLDS.end())
+      IndirectLDSList = KernelToIndirectLDS[K];
+
+    // No LDS globals to process? Ignore the kernel, go to the next one.
+    if (DirectLDSList.empty() && IndirectLDSList.empty())
+      continue;
+
+    // Process the LDS globals.
+    Changed |= handleDeviceScopeSharedVariablesForCurKernel(
+        M, K, LDSToFunction, LDSToSize, DirectLDSList, IndirectLDSList,
+        OldCalleeToNewCallee);
+  }
+
+  return Changed;
+}
+
+static bool
+handleDeviceScopeSharedVariables(Module &M,
+                                 SetVector<GlobalVariable *> &LDSGlobals,
+                                 SetVector<Function *> &Kernels) {
+  // Pair up each LDS global with the enclosing function where the LDS global
+  // is defined.
+  ValueMap<GlobalVariable *, Function *> LDSToFunction;
+  for (GlobalVariable *LDSGlobal : LDSGlobals)
+    pairUpLDSGlobalWithEnclosingFunction(LDSGlobal, LDSToFunction);
+
+  // Create the reverse map from enclosing function to LDS global list.
+  ValueMap<Function *, SetVector<GlobalVariable *>> FunctionToLDS;
+  createFunctionToLDSMap(LDSToFunction, FunctionToLDS);
+
+  // Pair up kernels with the callee list which defines LDS globals.
+  ValueMap<Function *, SetVector<Function *>> KernelToCallee;
+  for (Function *K : Kernels)
+    pairUpKernelWithCalleeList(M, K, FunctionToLDS, KernelToCallee);
+
+  // Pair up kernels with all the LDS globals: both direct LDS globals (those
+  // directly defined within the kernels), and indirect LDS globals (those
+  // indirectly defined within the callees).
+  ValueMap<Function *, SetVector<GlobalVariable *>> KernelToDirectLDS;
+  ValueMap<Function *, SetVector<GlobalVariable *>> KernelToIndirectLDS;
+  for (Function *K : Kernels)
+    pairUpKernelWithLDSList(K, KernelToCallee, FunctionToLDS,
+                            KernelToDirectLDS, KernelToIndirectLDS);
+
+  // Get the size of each LDS global in bytes.
+  ValueMap<GlobalVariable *, uint64_t> LDSToSize;
+  for (GlobalVariable *LDSGlobal : LDSGlobals)
+    getLDSGlobalSizeInBytes(M, LDSGlobal, LDSToSize);
+
+  return handleDeviceScopeSharedVariables(M, Kernels, LDSToFunction,
+                                          KernelToDirectLDS,
+                                          KernelToIndirectLDS, LDSToSize);
+}
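+// For illustration (hypothetical module): given kernels @foo and @baz and
+// internal LDS globals @SharedA (used by @foo) and @SharedC (used by @baz),
+// the driver below collects { @SharedA, @SharedC } and { @foo, @baz }, and
+// each kernel ends up with its own packed layout (@foo_LDSLayout,
+// @baz_LDSLayout).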
+static bool handleDeviceScopeSharedVariables(Module &M) {
+  // Collect all the (static) LDS globals defined within the current module.
+  SetVector<GlobalVariable *> LDSGlobals;
+  for (GlobalVariable &GV : M.globals())
+    if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+        GV.hasInternalLinkage())
+      LDSGlobals.insert(&GV);
+
+  if (LDSGlobals.empty()) {
+    LLVM_DEBUG(dbgs() << "No LDS globals defined in the module " << M.getName()
+                      << ", skipping handling of device scope shared variables"
+                      << "\n");
+    return false;
+  }
+
+  // Collect all the amdgpu kernels defined within the current module.
+  SetVector<Function *> Kernels;
+  for (Function &F : M.functions()) {
+    if ((F.getCallingConv() == CallingConv::AMDGPU_KERNEL) &&
+        !F.isDeclaration())
+      Kernels.insert(&F);
+  }
+
+  if (Kernels.empty()) {
+    LLVM_DEBUG(dbgs() << "No kernels defined in the module " << M.getName()
+                      << ", skipping handling of device scope shared variables"
+                      << "\n");
+    return false;
+  }
+
+  return handleDeviceScopeSharedVariables(M, LDSGlobals, Kernels);
+}
+
+bool AMDGPUDeviceScopeSharedVariable::runOnModule(Module &M) {
+  LLVM_DEBUG(dbgs() << "===== Handling device scope shared variables in the "
+                       "module "
+                    << M.getName() << "\n");
+
+  // TODO: We only want to handle HIP kernels, and no kernels from other
+  // programming languages, like OpenCL, OpenMP, etc. Do we need to add a
+  // condition here for it, and skip running the pass for non-HIP kernels?
+  if (skipModule(M)) {
+    LLVM_DEBUG(dbgs() << "Skipping handling of device scope shared variables "
+                         "in the module "
+                      << M.getName() << "\n");
+    return false;
+  }
+
+  bool Changed = handleDeviceScopeSharedVariables(M);
+
+  LLVM_DEBUG(dbgs() << "===== Done with handling device scope shared "
+                       "variables in the module "
+                    << M.getName() << "\n");
+
+  return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -40,6 +40,7 @@
   static bool EnableLateStructurizeCFG;
   static bool EnableFunctionCalls;
   static bool EnableFixedFunctionABI;
+  static bool EnableDeviceScopeSharedVariable;
 
   AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                       StringRef FS, TargetOptions Options,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -198,6 +198,12 @@
     cl::desc("Enable workarounds for the StructurizeCFG pass"),
     cl::init(true), cl::Hidden);
 
+static cl::opt<bool, true> EnableDeviceScopeSharedVariable(
+    "amdgpu-enable-device-scope-shared-variable",
+    cl::desc("Support amdgpu device scope shared variables"),
+    cl::location(AMDGPUTargetMachine::EnableDeviceScopeSharedVariable),
+    cl::init(false), cl::Hidden);
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -265,6 +271,7 @@
   initializeGCNRegBankReassignPass(*PR);
   initializeGCNNSAReassignPass(*PR);
   initializeSIAddIMGInitPass(*PR);
+  initializeAMDGPUDeviceScopeSharedVariablePass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -394,6 +401,7 @@
 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
 bool AMDGPUTargetMachine::EnableFunctionCalls = false;
 bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
+bool AMDGPUTargetMachine::EnableDeviceScopeSharedVariable = false;
 
 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
 
@@ -771,6 +779,11 @@
   // but EarlyCSE can do neither of them.
   if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses)
     addEarlyCSEOrGVNPass();
+
+  // This pass is expected to run as the last IR pass, so make sure it is
+  // added last.
+  if (EnableDeviceScopeSharedVariable)
+    addPass(createAMDGPUDeviceScopeSharedVariablePass());
 }
 
 void AMDGPUPassConfig::addCodeGenPrepare() {
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -48,6 +48,7 @@
   AMDGPUAtomicOptimizer.cpp
   AMDGPUCallLowering.cpp
   AMDGPUCodeGenPrepare.cpp
+  AMDGPUDeviceScopeSharedVariable.cpp
   AMDGPUExportClustering.cpp
   AMDGPUFixFunctionBitcasts.cpp
   AMDGPUFrameLowering.cpp