diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -271,6 +271,10 @@ void initializeGCNNSAReassignPass(PassRegistry &); extern char &GCNNSAReassignID; +ModulePass *createAMDGPUDeviceScopeSharedVariablePass(); +void initializeAMDGPUDeviceScopeSharedVariablePass(PassRegistry &); +extern char &AMDGPUDeviceScopeSharedVariableID; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -123,13 +123,15 @@ // should only appear when IPO passes manages to move LDs defined in a kernel // into a single user function. - for (GlobalVariable &GV : M.globals()) { - // TODO: Region address - unsigned AS = GV.getAddressSpace(); - if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS) - continue; - - recursivelyVisitUsers(GV, FuncsToAlwaysInline); + if (!AMDGPUTargetMachine::EnableDeviceScopeSharedVariable) { + for (GlobalVariable &GV : M.globals()) { + // TODO: Region address + unsigned AS = GV.getAddressSpace(); + if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS) + continue; + + recursivelyVisitUsers(GV, FuncsToAlwaysInline); + } } if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp b/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp @@ -0,0 +1,1057 @@ +//===-- AMDGPUDeviceScopeSharedVariables.cpp ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// TODO: +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueMap.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include +#include +#include + +#define DEBUG_TYPE "amdgpu-device-scope-shared-variable" + +using namespace llvm; + +namespace { + +class AMDGPUDeviceScopeSharedVariable : public ModulePass { +public: + static char ID; + + AMDGPUDeviceScopeSharedVariable() : ModulePass(ID) { + initializeAMDGPUDeviceScopeSharedVariablePass( + *PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override; +}; + +} // namespace + +char AMDGPUDeviceScopeSharedVariable::ID = 0; + +char &llvm::AMDGPUDeviceScopeSharedVariableID = + AMDGPUDeviceScopeSharedVariable::ID; + +ModulePass *llvm::createAMDGPUDeviceScopeSharedVariablePass() { + return new AMDGPUDeviceScopeSharedVariable(); +} + +INITIALIZE_PASS_BEGIN(AMDGPUDeviceScopeSharedVariable, + "implement-amdgpu-device-scope-shared-variable", + "Implement AMDPGU Device Scope Shared Variable", + false /*only look at the cfg*/, false /*analysis pass*/) +INITIALIZE_PASS_DEPENDENCY(AMDGPUAlwaysInline) +INITIALIZE_PASS_DEPENDENCY(SimpleInliner) +INITIALIZE_PASS_END(AMDGPUDeviceScopeSharedVariable, + "implement-amdgpu-device-scope-shared-variable", + "Implement AMDPGU Device Scope Shared Variable", + false /*only look at the cfg*/, false /*analysis pass*/) + +static void updateKernelToCallieList( + ValueMap> &KernelToCallie, + ValueMap &OldCallieToNewCallie) { + for (auto KI = KernelToCallie.begin(), KE = KernelToCallie.end(); KI != KE; + ++KI) { + 
auto *K = KI->first; + auto OldCallieList = KI->second; + std::set NewCallieList; + for (auto *OldCallie : OldCallieList) + if (OldCallieToNewCallie.find(OldCallie) != OldCallieToNewCallie.end()) + NewCallieList.insert(OldCallieToNewCallie[OldCallie]); + else + NewCallieList.insert(OldCallie); + KernelToCallie[K] = NewCallieList; + } +} + +static void +updateLDSToFunctionMap(ValueMap &LDSToFunction, + ValueMap &OldCallieToNewCallie) { + for (auto LI = LDSToFunction.begin(), LE = LDSToFunction.end(); LI != LE; + ++LI) { + auto *LDS = LI->first; + auto *OldF = LI->second; + if (OldCallieToNewCallie.find(OldF) != OldCallieToNewCallie.end()) + LDSToFunction[LDS] = OldCallieToNewCallie[OldF]; + } +} + +static void createFunctionToLDSMap( + ValueMap &LDSToFunction, + ValueMap> &FunctionToLDS) { + for (auto LI = LDSToFunction.begin(), LE = LDSToFunction.end(); LI != LE; + ++LI) { + auto *LDSGlobal = LI->first; + auto *F = LI->second; + auto FI = FunctionToLDS.find(F); + if (FI == FunctionToLDS.end()) { + SetVector LDSSet; + LDSSet.insert(LDSGlobal); + FunctionToLDS[F] = LDSSet; + } else + FunctionToLDS[F].insert(LDSGlobal); + } +} + +static void pairUpKernelWithLDSList( + Function *K, ValueMap> &KernelToCallie, + ValueMap> &FunctionToLDS, + ValueMap> &KernelToDirectLDS, + ValueMap> &KernelToIndirectLDS) { + // If direct LDS globals exist within the kernel, collect it + if (FunctionToLDS.find(K) != FunctionToLDS.end()) + KernelToDirectLDS[K] = FunctionToLDS[K]; + + // Collect all the indirect LDS globals defined within the callie(s) of the + // kernel + SetVector IndirectLDSSet; + auto Callies = KernelToCallie[K]; + for (auto *Callie : Callies) { + if (FunctionToLDS.find(Callie) == FunctionToLDS.end()) + continue; + SetVector CallieLDSList = FunctionToLDS[Callie]; + for (auto *CallieLDS : CallieLDSList) + IndirectLDSSet.insert(CallieLDS); + } + if (!IndirectLDSSet.empty()) + KernelToIndirectLDS[K] = IndirectLDSSet; +} + +static void pairUpKernelWithCallieList( + Module &M, 
Function *K, + ValueMap> &FunctionToLDS, + ValueMap> &KernelToCallie) { + // Get the call graph node associated with current kernel, traverse the call + // graph associated with it in DFS manner and collect all the associated + // callies which define LDS global(s) + auto CG = CallGraph(M); + auto *KernCGNode = CG[K]; + SmallVector CGNodeStack; + SetVector Visited; + +#ifndef NDEBUG + assert(KernCGNode && "Call graph node associated with kernel definition " + "cannot be null\n"); +#endif + + for (auto KI = KernCGNode->begin(), KE = KernCGNode->end(); KI != KE; ++KI) { + auto *CGN = KI->second; +#ifndef NDEBUG + assert(CGN && "Call graph node associated with function definition cannot" + " be null\n"); +#endif + CGNodeStack.push_back(CGN); + } + + std::set CallieSet; + while (!CGNodeStack.empty()) { + auto *CGNode = CGNodeStack.pop_back_val(); + if (!Visited.insert(CGNode)) + continue; + + auto *F = CGNode->getFunction(); + if (!F || F->isDeclaration()) { +#ifndef NDEBUG + assert(CGNode->empty() && "Call graph node associated with function " + "declaration should not have callie list\n"); +#endif + continue; + } + + if (FunctionToLDS.find(F) != FunctionToLDS.end()) + CallieSet.insert(F); + + for (auto CI = CGNode->begin(), CE = CGNode->end(); CI != CE; ++CI) { + auto *CGN = CI->second; +#ifndef NDEBUG + assert(CGN && "Call graph node associated with function definition cannot" + " be null\n"); +#endif + CGNodeStack.push_back(CGN); + } + } + + KernelToCallie[K] = CallieSet; +} + +static void pairUpLDSGlobalWithItsAssociatedFunction( + GlobalVariable *LDSGlobal, + ValueMap &LDSToFunction) { + // Recursively visit the user list of current LDS global, and find the + // enclosing function where the LDS global is defined, and the enclosing + // function should always be successfully found. + // + // TODO: Is there any other efficient way to find the enclosing functions of + // LDS globals? 
+#ifndef NDEBUG + assert(!LDSGlobal->user_empty() && + "LDS Global user list cannot be empty since it must have been defined " + "within either kernel or device function"); +#endif + SmallVector UserStack; + SetVector Visited; + + for (auto *U : LDSGlobal->users()) + UserStack.push_back(U); + + while (!UserStack.empty()) { + auto *U = UserStack.pop_back_val(); + if (!Visited.insert(U)) + continue; + + if (auto *I = dyn_cast(U)) { + auto *F = I->getParent()->getParent(); + if (F) { + LDSToFunction[LDSGlobal] = F; + return; + } + continue; + } + + for (auto *UU : U->users()) + UserStack.push_back(UU); + } +#ifndef NDEBUG + assert(false && "Control is not expected to reach this point"); +#endif +} + +static void +getLDSGlobalSizeInBytes(Module &M, GlobalVariable *LDSGlobal, + ValueMap &LDSToSize) { + auto *Ty = LDSGlobal->getValueType(); + auto &DL = M.getDataLayout(); + auto SizeInBytes = DL.getTypeSizeInBits(Ty).getFixedSize() / 8; + LDSToSize[LDSGlobal] = SizeInBytes; +} + +static void +eraseOldCallies(ValueMap &OldCallieToNewCallie) { + // TODO: May be we can come-up with a more efficient implmentation erase old + // callies from the module. It depends on how many callies that we land-up + // erasing in a real world hip application. May be not many, hence, as of now, + // we have just employed a simplest method of repeatedly visiting the old + // callies and removing each of them, once their number of uses become 0. + bool Loopover = true; + while (Loopover) { + Loopover = false; + for (auto OI = OldCallieToNewCallie.begin(), + OE = OldCallieToNewCallie.end(); + OI != OE; ++OI) { + auto *OldCallie = OI->first; + if (OldCallie->getNumUses()) + continue; + OldCallieToNewCallie.erase(OI); + OldCallie->eraseFromParent(); + Loopover = true; + } + } +} + +// TODO: This really looks like a horrible hack to me, but, is there any better +// way of handling `ConstantExprs`? I have no idea at the moment, need to +// revisit it later. 
+static Instruction *replaceConstExprByInst(ConstantExpr *CE) { + for (auto *U : CE->users()) { + auto *I = dyn_cast(U); + + if (!I) + I = replaceConstExprByInst(dyn_cast(U)); + + if (I) { + auto *NI = CE->getAsInstruction(); + NI->insertBefore(I); + unsigned Ind = 0; + for (Use &UU : I->operands()) { + Value *V = UU.get(); + if (V == CE) { + I->setOperand(Ind, NI); + break; + } + ++Ind; + } + return NI; + } + } + + return nullptr; +} + +static void replaceInstWhichUsesLDS(GlobalVariable *LDS, Value *BasePtr, + Function *F, Instruction *I, + SetVector &ToBeErasedInsts) { + // Assert that the function associated with the `I` is nonthing but the + // one where LDS global is actually defined. +#ifndef NDEBUG + assert(I->getParent()->getParent() == F && + "The reference to LDS should only exists within the function where it " + "is actually defined\n"); +#endif + + // Suffix the names of the instructions with unique integer values + static uint64_t Suffix = 0; + ++Suffix; + + // The new instruction which replaces `UserInst`. + Instruction *NewI = nullptr; + + switch (I->getOpcode()) { + case Instruction::GetElementPtr: { + // 1. Extract the last operand of `GEPInst`, say, it is, `Offset` + // 2. 
Create pointer arithmetic instruction `BasePtr + Offset` + auto *GEPInst = dyn_cast(I); +#ifndef NDEBUG + assert(GEPInst->hasIndices() && "Expected one or more GEP indecies\n"); +#endif + Value *Offset = GEPInst->getOperand(GEPInst->getNumIndices()); + NewI = GetElementPtrInst::CreateInBounds( + GEPInst->getResultElementType(), BasePtr, Offset, + Twine(BasePtr->getName()) + Twine(".ptr.arith.") + Twine(Suffix), I); + break; + } + case Instruction::Load: { + auto *LInst = dyn_cast(I); + NewI = new LoadInst( + LInst->getType(), BasePtr, + Twine(BasePtr->getName()) + Twine(".load.") + Twine(Suffix), I); + break; + } + case Instruction::Store: { + auto *SInst = dyn_cast(I); + NewI = new StoreInst(SInst->getValueOperand(), BasePtr, I); + break; + } + case Instruction::PtrToInt: { + auto *PToIInst = dyn_cast(I); + NewI = new PtrToIntInst( + BasePtr, PToIInst->getType(), + Twine(BasePtr->getName()) + Twine(".ptoi.") + Twine(Suffix), I); + break; + } + case Instruction::BitCast: { + auto *BCInst = dyn_cast(I); + // TODO: When `BasePtr` type is same as `BCInst` type, is there any + // efficient way to handle it? + NewI = new BitCastInst( + BasePtr, BCInst->getDestTy(), + Twine(BasePtr->getName()) + Twine(".bitcast.") + Twine(Suffix), I); + break; + } + case Instruction::PHI: { + auto *PhiInst = dyn_cast(I); + + // New PHI value to be replaced with. + Value *NewPhiValue = BasePtr; + if (PhiInst->getType() != BasePtr->getType()) { + Instruction *InsertBefore = nullptr; + if (auto *II = dyn_cast(BasePtr)) + InsertBefore = II->getNextNode(); + else + InsertBefore = &*F->getEntryBlock().getFirstInsertionPt(); + + NewPhiValue = new BitCastInst(BasePtr, PhiInst->getType(), + Twine(BasePtr->getName()) + + Twine(".bitcast.") + Twine(Suffix), + InsertBefore); + } + + // New PHI instruction which replaces `PhiInst`. 
+ auto *NewPhiInst = PHINode::Create( + PhiInst->getType(), 0, + Twine(BasePtr->getName()) + Twine(".phi.") + Twine(Suffix), I); + + // Add PHI values to new PHI instruction. + for (unsigned i = 0; i < PhiInst->getNumIncomingValues(); ++i) { + auto *V = PhiInst->getIncomingValue(i); + auto *IBB = PhiInst->getIncomingBlock(i); + if (V == LDS) + NewPhiInst->addIncoming(NewPhiValue, IBB); + else + NewPhiInst->addIncoming(V, IBB); + } + + NewI = NewPhiInst; + break; + } + default: + llvm_unreachable("Not Implemented."); // TODO: What else is missing? + } + + // If `NewI` is created, then, replace `I` by `NewI`, erase `I`, and mark `I` + // as `to be erased` instruction. +#ifndef NDEBUG + assert(NewI && "Valid instruction expected"); +#endif + NewI->copyMetadata(*I); + I->replaceAllUsesWith(NewI); + ToBeErasedInsts.insert(I); +} + +static void updateFunctionAssociatedWithLDS(GlobalVariable *LDS, Value *BasePtr, + Function *F) { + // Keep track of all the erased to be instructions. + SetVector ToBeErasedInsts; + + // Traverse through each `use` of `LDS`, create a new to be replaced value + // for each use case, and accordingly replace it with new one. + for (auto *U : LDS->users()) { + // `U` may be using `LDS`, but 'U` itself is not used anywhere, ignore `U`. + if (!U->getNumUses()) + continue; + + // Cast away const-ness from `U`. + User *UU = const_cast(U); + + if (auto *I = dyn_cast(UU)) { + replaceInstWhichUsesLDS(LDS, BasePtr, F, I, ToBeErasedInsts); + } else if (auto *CE = dyn_cast(UU)) { + // TODO: There could be some performance impact here since we convert + // constant expressions into explicit instructions. Need to revisit it + // later, constant folding? + auto *I = replaceConstExprByInst(CE); + replaceInstWhichUsesLDS(LDS, BasePtr, F, I, ToBeErasedInsts); + CE->removeDeadConstantUsers(); + } else + llvm_unreachable("Not Implemented."); // TODO: What else is missing? + } + + // Erase all the instructions which are got replaced by new ones. 
+ for (auto *I : ToBeErasedInsts) + I->eraseFromParent(); +} + +static void getNewArgumentList( + GlobalVariable *LDS, + std::map> + &KernelToIndirectBasePtrInst, + Function *Caller, CallInst *CI, SmallVectorImpl &NewArgs) { + Value *NewArg = nullptr; + + if (Caller->getCallingConv() == CallingConv::AMDGPU_KERNEL) { + std::map LDSToIndirectBasePtrInst = + KernelToIndirectBasePtrInst[Caller]; + NewArg = LDSToIndirectBasePtrInst[LDS]; + } else + NewArg = Caller->getArg(Caller->arg_size() - 1); + + for (auto AI = CI->arg_begin(), AE = CI->arg_end(); AI != AE; ++AI) + NewArgs.push_back(*AI); + NewArgs.push_back(NewArg); +} + +static bool +isNewClonedFunction(Function *F, + ValueMap &OldCallieToNewCallie) { + for (auto OI = OldCallieToNewCallie.begin(), OE = OldCallieToNewCallie.end(); + OI != OE; ++OI) { + auto *NF = OI->second; + if (F == NF) + return true; + } + return false; +} + +static void +updateCallSites(GlobalVariable *LDS, Function *OldCallie, Function *NewCallie, + std::map> + &KernelToIndirectBasePtrInst, + ValueMap &OldCallieToNewCallie) { + // Update call sites for current callie. + for (auto *U : OldCallie->users()) { + // Get call instruction. + auto *CI = dyn_cast(U); +#ifndef NDEBUG + assert(CI && "Valid call instruction expected"); +#endif + + // We are only interested in the call sites within kernel or within new + // cloned functions. + auto *Caller = CI->getParent()->getParent(); + if (Caller->getCallingConv() != CallingConv::AMDGPU_KERNEL && + !isNewClonedFunction(Caller, OldCallieToNewCallie)) + continue; + + // Get new argument list which can be used to insert new call instruction. + SmallVector NewArgs; + getNewArgumentList(LDS, KernelToIndirectBasePtrInst, Caller, CI, NewArgs); + + // Insert new call instruction `NewCI` just before the existing call + // instruction `CI`. + auto *NewCI = CallInst::Create(NewCallie->getFunctionType(), NewCallie, + NewArgs, Twine(""), CI); + // TODO: Why copyMetadata() not copying meta data. 
I see metadat associated + // with CI, but it is not copied to NewCI. CI->hasMetadata() is false, why? + NewCI->copyMetadata(*CI); + NewCI->setTailCall(CI->isTailCall()); + NewCI->setCallingConv(CI->getCallingConv()); + + // Now, since new updated call instruction is in place, delete old one. + CI->replaceAllUsesWith(NewCI); + CI->eraseFromParent(); + } +} + +static void +updateCallSites(GlobalVariable *LDS, + std::map> + &KernelToIndirectBasePtrInst, + ValueMap &OldCallieToNewCallie) { + // Update call sites for all callies. + for (auto OI = OldCallieToNewCallie.begin(), OE = OldCallieToNewCallie.end(); + OI != OE; ++OI) { + auto *OldCallie = OI->first; + auto *NewCallie = OI->second; + updateCallSites(LDS, OldCallie, NewCallie, KernelToIndirectBasePtrInst, + OldCallieToNewCallie); + } +} + +static Function *cloneCallie(Module &M, Type *NewParamType, Function *Callie) { + // Create a new function type by adding `NewParamType` to the end of existing + // parameter list. + SmallVector NewParams; + auto *FnTy = Callie->getFunctionType(); + for (auto PI = FnTy->param_begin(), PE = FnTy->param_end(); PI != PE; ++PI) + NewParams.push_back(*PI); + NewParams.push_back(NewParamType); + auto *NewFnTy = + FunctionType::get(FnTy->getReturnType(), NewParams, FnTy->isVarArg()); + + // Create a copy of the `Callie`, but with new function type + auto *NewCallie = + Function::Create(NewFnTy, Callie->getLinkage(), Callie->getAddressSpace(), + Callie->getName() + Twine(".c")); + + // TODO: what does this map required for? + ValueToValueMapTy VMap; + auto *NewCallieArgIt = NewCallie->arg_begin(); + for (auto &Arg : Callie->args()) { + auto ArgName = Arg.getName(); + NewCallieArgIt->setName(ArgName); + VMap[&Arg] = &(*NewCallieArgIt++); + } + + // TODO: ModuleLevelChanges should be set to true or false? 
+ SmallVector Returns; + CloneFunctionInto(NewCallie, Callie, VMap, /*ModuleLevelChanges=*/false, + Returns); + + // Copy all metadata + SmallVector, 1> MDs; + Callie->getAllMetadata(MDs); + for (auto MDIt : MDs) + NewCallie->addMetadata(MDIt.first, *MDIt.second); + + // Insert `NewCallie` just before `Callie` within the module. + M.getFunctionList().insert(Callie->getIterator(), NewCallie); + + return NewCallie; +} + +static void CollectCallGraphPathsBetweenKernelAndCallie( + Module &M, Function *K, Function *Callie, + SmallVectorImpl> &CGPaths) { + // Traverse the call graph associated with the kernel in DFS manner and + // collect all the paths from kernel to callie. + // + // TODO: Note that this algorithm will not work if there exist recursive + // calls, and the current assumption here is that the call graph is acyclic. + // We need to visit it back again to handle call graph which could contain + // cycles. + auto CG = CallGraph(M); + auto *KernCGNode = CG[K]; +#ifndef NDEBUG + assert(KernCGNode && "Call graph node associated with kernel definition " + "cannot be null\n"); +#endif + + SmallVector, 8> Stack; + SetVector Path; + Path.insert(KernCGNode); + Stack.push_back(Path); + + while (!Stack.empty()) { + auto TopPath = Stack.pop_back_val(); + auto *CGNode = TopPath.back(); + auto *F = CGNode->getFunction(); + if (F == Callie) { + SetVector FPath; + for (auto *CGN : TopPath) + FPath.insert(CGN->getFunction()); + CGPaths.push_back(FPath); + continue; + } + + for (auto NI = CGNode->begin(), NE = CGNode->end(); NI != NE; ++NI) { + auto *CGN = NI->second; +#ifndef NDEBUG + assert(CGN && "Call graph node associated with function definition cannot" + " be null\n"); +#endif + SetVector ClonedPath(TopPath.begin(), TopPath.end()); + ClonedPath.insert(CGN); + Stack.push_back(ClonedPath); + } + } +} + +static void createCloneOfCalliesWithNewParameter( + Module &M, GlobalVariable *LDS, ValueMap &KToC, + Type *BasePtrType, ValueMap &OldCallieToNewCallie) { + // Update 
callies to accept the new parameter which is of type `BasePtrType` + // by creating their clones. Here is the brief sketch of the functionality of + // this function: + // + // 1. Collect all the call graph paths between the kernels and the callies + // 2. Traverse all the call graph paths from kernels to callies. + // 3. For each device function encoutered while traversing, create a clone of + // it, by adding a new parameter of type BasePtr`s type to it's parameter + // list, but also retain the original device function for a moment. + SmallVector, 8> CGPaths; + for (auto KI = KToC.begin(), KE = KToC.end(); KI != KE; ++KI) + CollectCallGraphPathsBetweenKernelAndCallie(M, KI->first, KI->second, + CGPaths); + + for (auto CGPath : CGPaths) { + // TODO: We can in-fact assert that length of `CGPath` is atleast two. But, + // we are okay for now. + if (CGPath.size() < 2) + continue; + + // We are interested in cloning of only device functions in the call graph + // path, hence we start from second node in the call graph path. + auto PI = CGPath.begin() + 1; + auto PE = CGPath.end(); + for (; PI != PE; ++PI) { + auto *CurCallie = *PI; + + if (OldCallieToNewCallie.find(CurCallie) != OldCallieToNewCallie.end()) + continue; + + // The `CurCallie` is encountered first time for the LDS in question, + // Create a clone of it, and save it. 
+ auto *NewCallie = cloneCallie(M, BasePtrType, CurCallie); + OldCallieToNewCallie[CurCallie] = NewCallie; + } + } +} + +static Type *getBasePtrAccessInstType( + GlobalVariable *LDS, + std::map> + &KernelToIndirectBasePtrInst) { + for (auto KI = KernelToIndirectBasePtrInst.begin(), + KE = KernelToIndirectBasePtrInst.end(); + KI != KE; ++KI) + for (auto BPI = KI->second.begin(), BPE = KI->second.end(); BPI != BPE; + ++BPI) + if (LDS == BPI->first) + return BPI->second->getType(); + +#ifndef NDEBUG + assert(false && "Control is not expected to reach this point"); +#endif + return nullptr; +} + +static void +getKernelToCallieMap(GlobalVariable *LDS, + ValueMap &LDSToFunction, + ValueMap> &KernelToCallie, + ValueMap &KToC) { + // Collect all call graph paths from kernels to end-callies which are + // associated with current LDS. + auto *Callie = LDSToFunction[LDS]; + for (auto KI = KernelToCallie.begin(), KE = KernelToCallie.end(); KI != KE; + ++KI) { + auto *K = KI->first; + auto Callies = KI->second; + if (Callies.find(Callie) != Callies.end()) + KToC[K] = Callie; + } +} + +static bool handleIndirectLDSGlobals( + Module &M, ValueMap &LDSToFunction, + ValueMap> &KernelToCallie, + ValueMap> &KernelToIndirectLDS, + std::map> + &KernelToIndirectBasePtrInst) { + // Construct a grand list of all indirect LDS globals. + SetVector GrandLDSList; + for (auto KI = KernelToIndirectLDS.begin(), KE = KernelToIndirectLDS.end(); + KI != KE; ++KI) + for (auto *LDS : KI->second) + GrandLDSList.insert(LDS); + + // Process each indirect LDS global one by one. + for (auto *LDS : GrandLDSList) { + // For the current indirect LDS, pair-up kernel and end-callie. + ValueMap KToC; + getKernelToCallieMap(LDS, LDSToFunction, KernelToCallie, KToC); + + // For the current indirect LDS, get the base pointer access instruction + // type. 
+ auto *BasePtrType = + getBasePtrAccessInstType(LDS, KernelToIndirectBasePtrInst); + + // Create clones of all those functions which exist within the call graph + // paths from kernels to end-callie so that they accept new argument + // associated with current indirect LDS global. + ValueMap OldCallieToNewCallie; + createCloneOfCalliesWithNewParameter(M, LDS, KToC, BasePtrType, + OldCallieToNewCallie); + + // Update all the required data structures to point to the new cloned + // functions in place of their old counterparts. + updateLDSToFunctionMap(LDSToFunction, OldCallieToNewCallie); + updateKernelToCallieList(KernelToCallie, OldCallieToNewCallie); + + // Update all call sites of the all old functions being cloned so that calls + // are being made to new cloned functions instead to old functions. + updateCallSites(LDS, KernelToIndirectBasePtrInst, OldCallieToNewCallie); + + // By now, all old functions are dead without any reference being made to + // them, erase them now from the module. + eraseOldCallies(OldCallieToNewCallie); + +#ifndef NDEBUG + assert(OldCallieToNewCallie.empty() && + "None of the old callies should alive by now\n"); +#endif + + // Finally, update the end-callie where the current LDS was originally + // defined, so that all the references to this LDS within this end-callie + // are appropriately replaced. + auto *F = LDSToFunction[LDS]; + updateFunctionAssociatedWithLDS(LDS, F->getArg(F->arg_size() - 1), F); + } + + return true; +} + +static bool handleDirectLDSGlobals( + Module &M, std::map> + &KernelToDirectBasePtrInst) { + // Go through each kernel one by one, and handle all the direct globals + // defined within each of them. 
+ for (auto KI = KernelToDirectBasePtrInst.begin(), + KE = KernelToDirectBasePtrInst.end(); + KI != KE; ++KI) { + auto *K = KI->first; + auto DirectLDSToBasePtrInst = KI->second; + + for (auto LI = DirectLDSToBasePtrInst.begin(), + LE = DirectLDSToBasePtrInst.end(); + LI != LE; ++LI) { + auto *LDS = LI->first; + auto *BasePtr = LI->second; + updateFunctionAssociatedWithLDS(LDS, BasePtr, K); + } + } + + return true; +} + +static Instruction *insertBasePointerAccessInstructionsWithinKernel( + Module &M, Function *K, GlobalVariable *LDS, GlobalVariable *NewLDS, + uint64_t Offset) { + // Insert instructions as below at the begining of the entry basic block of + // the kernel + // 1. Insert GEP instruction which access the address `NewLDS + Offset`, say, + // result is `GEPInst` which is of type `char*`. + // 2. Insert type cast instruction which type casts `GEPInst` from `char*` to + // `basetype*` where `basetype` is base type of `LDS`, say the result is, + // `CastInst`. + // 3. Return `CastInst`. + + // Suffix the names of the instructions with unique integer values + static int Suffix = 0; + ++Suffix; + + // Insert gep instruction + auto BI = K->getEntryBlock().getFirstInsertionPt(); +#ifndef NDEBUG + assert(BI != K->getEntryBlock().end() && + "Entry basic block of the kernel cannot be empty, otherwise control " + "would not reach this point\n"); +#endif + auto &EI = *BI; + Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(M.getContext())), + Constant::getIntegerValue( + Type::getInt64Ty(M.getContext()), APInt(64, Offset))}; + Instruction *GEPInst = GetElementPtrInst::CreateInBounds( + NewLDS->getValueType(), const_cast(NewLDS), Indices, + Twine("dssv.gep.") + Twine(Suffix), const_cast(&EI)); + + // Insert type-cast instruction + // TODO: Do we need to handle any other aggregate type apart from array type? + // and, what about 2 and other higher dimensional arrays? 
+ auto *LDSValueType = LDS->getValueType(); + if (auto *AT = dyn_cast(LDSValueType)) + LDSValueType = AT->getElementType(); + auto *ToBeCastedType = + PointerType::get(LDSValueType, AMDGPUAS::LOCAL_ADDRESS); + Instruction *CastInst = new BitCastInst(GEPInst, ToBeCastedType, + Twine("dssv.cast.") + Twine(Suffix), + const_cast(&EI)); + + // Return type-casted instruction + return CastInst; +} + +static GlobalVariable * +createSingleContiguousLayout(Module &M, Function *K, + uint64_t &TotalLDSSizeInBytes) { + // Insert a new LDS global which is nothing but a single contigeous shared + // memory layout representing all the LDS globals associted with the kernel + // which includes those directly defined within the kernel and those + // indirectly defined within callies. + // + // The size of this new contigeous LDS global layout is equal to the sum of + // the sizes of all the associated LDS globals. + // TODO: what about the name of this new LDS global? is it fine or need to be + // changed? + auto *NewLDSTy = + ArrayType::get(IntegerType::get(M.getContext(), 8), TotalLDSSizeInBytes); + auto *NewLDS = new GlobalVariable( + M, NewLDSTy, false, GlobalValue::InternalLinkage, + UndefValue::get(NewLDSTy), + Twine(K->getName()) + Twine(".Single.LDS.Layout"), nullptr, + GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); + NewLDS->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + NewLDS->setAlignment(MaybeAlign(M.getDataLayout().getPreferredAlign(NewLDS))); + + return NewLDS; +} + +static void +computeTotalLDSSizeInBytes(ValueMap &LDSToSize, + SetVector &DirectLDSList, + SetVector &IndirectLDSList, + ValueMap &LDSToOffset, + uint64_t &TotalLDSSizeInBytes) { + // For the current kernel, compute the total size of all LDS globals, and also + // offsets associated with them within in the new LDS global. 
+  TotalLDSSizeInBytes = 0;
+  for (auto *LDS : DirectLDSList) {
+    LDSToOffset[LDS] = TotalLDSSizeInBytes;
+    TotalLDSSizeInBytes += LDSToSize[LDS];
+  }
+  for (auto *LDS : IndirectLDSList) {
+    LDSToOffset[LDS] = TotalLDSSizeInBytes;
+    TotalLDSSizeInBytes += LDSToSize[LDS];
+  }
+}
+
+/// Prepare every kernel in \p Kernels that has LDS globals associated with it.
+///
+/// For each such kernel this computes a byte offset for every associated LDS
+/// global (direct ones first, then indirect ones), asks
+/// createSingleContiguousLayout() for one replacement LDS object of the total
+/// size, and records, per LDS global, whatever
+/// insertBasePointerAccessInstructionsWithinKernel() returns for it.
+///
+/// \param KernelToDirectLDS    LDS globals associated directly with a kernel.
+/// \param KernelToIndirectLDS  LDS globals associated with a kernel through
+///                             its callees.
+/// \param LDSToSize            Size in bytes of each LDS global.
+/// \param [out] KernelToDirectBasePtrInst    Per kernel, direct LDS global ->
+///                                           base pointer access.
+/// \param [out] KernelToIndirectBasePtrInst  Per kernel, indirect LDS global ->
+///                                           base pointer access.
+/// \returns true if at least one kernel was processed, false if no kernel has
+/// any LDS global associated with it.
+static bool prepareKernelsForHandlingLDSGlobals(
+    Module &M, SetVector &Kernels,
+    ValueMap> &KernelToDirectLDS,
+    ValueMap> &KernelToIndirectLDS,
+    ValueMap &LDSToSize,
+    std::map>
+        &KernelToDirectBasePtrInst,
+    std::map>
+        &KernelToIndirectBasePtrInst) {
+  bool Change = false;
+
+  // For each LDS global, insert base pointer access instruction within
+  // associated kernel(s).
+  for (auto *K : Kernels) {
+    // Copy both the direct and the indirect LDS list of the current kernel. A
+    // single find() per map is enough: a kernel that is absent from a map
+    // simply keeps an empty list, and nothing is inserted into the map.
+    SetVector DirectLDSList;
+    SetVector IndirectLDSList;
+    auto DirectIt = KernelToDirectLDS.find(K);
+    if (DirectIt != KernelToDirectLDS.end())
+      DirectLDSList = DirectIt->second;
+    auto IndirectIt = KernelToIndirectLDS.find(K);
+    if (IndirectIt != KernelToIndirectLDS.end())
+      IndirectLDSList = IndirectIt->second;
+
+    // No LDS globals to process? Skip this kernel and go to the next one.
+    if (DirectLDSList.empty() && IndirectLDSList.empty())
+      continue;
+
+    // We are going to process LDS globals for at least one kernel, and hence,
+    // we are going to make module level changes.
+    Change = true;
+
+    // Create a single contiguous LDS layout for the current kernel.
+    uint64_t TotalLDSSizeInBytes = 0;
+    ValueMap LDSToOffset;
+    computeTotalLDSSizeInBytes(LDSToSize, DirectLDSList, IndirectLDSList,
+                               LDSToOffset, TotalLDSSizeInBytes);
+    auto *NewLDS = createSingleContiguousLayout(M, K, TotalLDSSizeInBytes);
+
+    // For each LDS global (both direct and indirect ones), insert base pointer
+    // access instructions within kernel. The results are written straight into
+    // this kernel's entries of the output maps instead of being collected in
+    // local maps and copied afterwards. Both entries are created even when one
+    // of the two lists is empty, exactly as before.
+    auto &DirectLDSToBasePtrInst = KernelToDirectBasePtrInst[K];
+    auto &IndirectLDSToBasePtrInst = KernelToIndirectBasePtrInst[K];
+    for (auto *LDS : DirectLDSList)
+      DirectLDSToBasePtrInst[LDS] =
+          insertBasePointerAccessInstructionsWithinKernel(M, K, LDS, NewLDS,
+                                                          LDSToOffset[LDS]);
+    for (auto *LDS : IndirectLDSList)
+      IndirectLDSToBasePtrInst[LDS] =
+          insertBasePointerAccessInstructionsWithinKernel(M, K, LDS, NewLDS,
+                                                          LDSToOffset[LDS]);
+  }
+
+  return Change;
+}
+
+/// Rewrite the module once all kernel / callee / LDS associations are known.
+///
+/// Steps, in order: prepare the kernels (see
+/// prepareKernelsForHandlingLDSGlobals()), let handleIndirectLDSGlobals() and
+/// handleDirectLDSGlobals() do their part, then erase every global in
+/// \p LDSGlobals from the module.
+///
+/// \returns true if the module was modified, false if no kernel has any LDS
+/// global associated with it (in which case nothing is touched).
+static bool handleDeviceScopeSharedVariables(
+    Module &M, SetVector &Kernels,
+    SetVector &LDSGlobals,
+    ValueMap &LDSToFunction,
+    ValueMap> &KernelToCallie,
+    ValueMap> &KernelToDirectLDS,
+    ValueMap> &KernelToIndirectLDS,
+    ValueMap &LDSToSize) {
+  bool Change = false;
+  std::map>
+      KernelToDirectBasePtrInst;
+  std::map>
+      KernelToIndirectBasePtrInst;
+
+  // 1. Create a single contiguous LDS global layout for each kernel.
+  // 2. Compute base pointer offset for each LDS within the above single
+  //    contiguous LDS global layout, and insert it within the associated
+  //    kernel(s).
+  Change = prepareKernelsForHandlingLDSGlobals(
+      M, Kernels, KernelToDirectLDS, KernelToIndirectLDS, LDSToSize,
+      KernelToDirectBasePtrInst, KernelToIndirectBasePtrInst);
+
+  // None of the kernels has any LDS globals (direct and/or indirect ones)
+  // associated with them. Nothing to do, no changes being made to module.
+  if (!Change)
+    return false;
+
+  // From here on the result must stay true: the preparation step above has
+  // already processed at least one kernel and the original LDS globals are
+  // erased below. Accumulate with |= so that a handler returning false cannot
+  // make the pass report "module unchanged" after it has been modified.
+
+  // Handle all indirect LDS globals defined within device functions.
+  if (!KernelToIndirectLDS.empty())
+    Change |= handleIndirectLDSGlobals(M, LDSToFunction, KernelToCallie,
+                                       KernelToIndirectLDS,
+                                       KernelToIndirectBasePtrInst);
+
+  // Handle all direct LDS globals defined within kernels.
+  if (!KernelToDirectLDS.empty())
+    Change |= handleDirectLDSGlobals(M, KernelToDirectBasePtrInst);
+
+  // Now, finally, erase all the original LDS globals from the module.
+  // NOTE(review): this erases every collected LDS global, including ones that
+  // ended up associated with no kernel. It presumably relies on the handlers
+  // above having replaced all of their uses - confirm.
+  for (auto *LDS : LDSGlobals)
+    LDS->eraseFromParent();
+
+  return Change;
+}
+
+/// Build the association maps between LDS globals, functions and kernels and
+/// hand them to the overload that rewrites the module.
+///
+/// \param LDSGlobals  LDS globals to process.
+/// \param Kernels     Kernels to process.
+/// \returns the result of the rewriting overload, i.e. whether the module was
+/// modified.
+static bool
+handleDeviceScopeSharedVariables(Module &M,
+                                 SetVector &LDSGlobals,
+                                 SetVector &Kernels) {
+  // Pair up each LDS global with the enclosing function where the LDS global is
+  // defined.
+  ValueMap LDSToFunction;
+  for (auto *LDSGlobal : LDSGlobals)
+    pairUpLDSGlobalWithItsAssociatedFunction(LDSGlobal, LDSToFunction);
+
+  // Create the reverse map from enclosing function to LDS global list.
+  ValueMap> FunctionToLDS;
+  createFunctionToLDSMap(LDSToFunction, FunctionToLDS);
+
+  // Pair up kernels with the list of callees which define LDS globals.
+  ValueMap> KernelToCallie;
+  for (auto *K : Kernels)
+    pairUpKernelWithCallieList(M, K, FunctionToLDS, KernelToCallie);
+
+  // Pair up kernels with all the LDS globals: both direct LDS globals (those
+  // directly defined within the kernels), and indirect LDS globals (those
+  // indirectly defined within the callees).
+  ValueMap> KernelToDirectLDS;
+  ValueMap> KernelToIndirectLDS;
+  for (auto *K : Kernels)
+    pairUpKernelWithLDSList(K, KernelToCallie, FunctionToLDS, KernelToDirectLDS,
+                            KernelToIndirectLDS);
+
+  // Get the size of each LDS global in bytes.
+  ValueMap LDSToSize;
+  for (auto *LDSGlobal : LDSGlobals)
+    getLDSGlobalSizeInBytes(M, LDSGlobal, LDSToSize);
+
+  return handleDeviceScopeSharedVariables(M, Kernels, LDSGlobals, LDSToFunction,
+                                          KernelToCallie, KernelToDirectLDS,
+                                          KernelToIndirectLDS, LDSToSize);
+}
+
+/// Module-level entry point: collect the candidate LDS globals and kernels of
+/// \p M and process them.
+///
+/// An LDS global is a candidate when it lives in AMDGPUAS::LOCAL_ADDRESS and
+/// does not have external linkage; a function is a candidate kernel when it
+/// uses the AMDGPU_KERNEL calling convention and is not a mere declaration.
+///
+/// \returns false without touching the module if either set is empty,
+/// otherwise whether the module was modified.
+static bool handleDeviceScopeSharedVariables(Module &M) {
+  // Collect all the (static) LDS globals defined within the current module.
+  SetVector LDSGlobals;
+  for (auto &GV : M.globals())
+    if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+        !GV.hasExternalLinkage())
+      LDSGlobals.insert(&GV);
+
+  if (LDSGlobals.empty()) {
+    LLVM_DEBUG(dbgs() << "No LDS globals defined in the module " << M.getName()
+                      << ", skipping handling of device scope shared variables"
+                      << "\n");
+    return false;
+  }
+
+  // Collect all the amdgpu kernels defined within the current module.
+  SetVector Kernels;
+  for (auto &F : M.functions())
+    if ((F.getCallingConv() == CallingConv::AMDGPU_KERNEL) &&
+        !F.isDeclaration())
+      Kernels.insert(&F);
+
+  if (Kernels.empty()) {
+    LLVM_DEBUG(dbgs() << "No kernels defined in the module " << M.getName()
+                      << ", skipping handling of device scope shared variables"
+                      << "\n");
+    return false;
+  }
+
+  return handleDeviceScopeSharedVariables(M, LDSGlobals, Kernels);
+}
+
+/// Pass entry point. Honours skipModule() and otherwise delegates to
+/// handleDeviceScopeSharedVariables(Module &).
+///
+/// \returns true if the module was modified.
+bool AMDGPUDeviceScopeSharedVariable::runOnModule(Module &M) {
+  LLVM_DEBUG(dbgs() << "===== Handling device scope shared variables in the "
+                       "module "
+                    << M.getName() << "\n");
+
+  // TODO: We only want to handle HIP kernels, and no kernels from other
+  // programming languages, like OpenCL, OpenMP, etc. Do we need to add a
+  // condition here for it, and skip running the pass for non-HIP kernels?
+  if (skipModule(M)) {
+    LLVM_DEBUG(dbgs() << "Skipping handling of device scope shared variables "
+                         "in the module "
+                      << M.getName() << "\n");
+    return false;
+  }
+
+  bool Changed = handleDeviceScopeSharedVariables(M);
+
+  LLVM_DEBUG(dbgs() << "===== Done with handling device scope shared variables "
+                       "in the module "
+                    << M.getName() << "\n");
+
+  return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -40,6 +40,7 @@
   static bool EnableLateStructurizeCFG;
   static bool EnableFunctionCalls;
   static bool EnableFixedFunctionABI;
+  static bool EnableDeviceScopeSharedVariable;
 
   AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                       StringRef FS, TargetOptions Options,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -198,6 +198,12 @@
     cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
     cl::Hidden);
 
+static cl::opt EnableDeviceScopeSharedVariable(
+    "amdgpu-enable-device-scope-shared-variable",
+    cl::desc("Support amdgpu device scope shared variables"),
+    cl::location(AMDGPUTargetMachine::EnableDeviceScopeSharedVariable),
+    cl::init(false), cl::Hidden);
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine X(getTheAMDGPUTarget());
@@ -265,6 +271,7 @@
   initializeGCNRegBankReassignPass(*PR);
   initializeGCNNSAReassignPass(*PR);
   initializeSIAddIMGInitPass(*PR);
+  initializeAMDGPUDeviceScopeSharedVariablePass(*PR);
 }
 
 static std::unique_ptr createTLOF(const Triple &TT) {
@@ -394,6 +401,7 @@
 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
 bool AMDGPUTargetMachine::EnableFunctionCalls = false;
 bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
+bool AMDGPUTargetMachine::EnableDeviceScopeSharedVariable = false;
 
 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
 
@@ -771,6 +779,11 @@
   // but EarlyCSE can do neither of them.
   if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses)
     addEarlyCSEOrGVNPass();
+
+  // We expect to run this pass as a last IR pass. Hence make sure that this
+  // pass is added as a last IR pass
+  if (EnableDeviceScopeSharedVariable)
+    addPass(createAMDGPUDeviceScopeSharedVariablePass());
 }
 
 void AMDGPUPassConfig::addCodeGenPrepare() {
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -48,6 +48,7 @@
   AMDGPUAtomicOptimizer.cpp
   AMDGPUCallLowering.cpp
   AMDGPUCodeGenPrepare.cpp
+  AMDGPUDeviceScopeSharedVariable.cpp
   AMDGPUExportClustering.cpp
   AMDGPUFixFunctionBitcasts.cpp
   AMDGPUFrameLowering.cpp