diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -271,6 +271,10 @@
 void initializeGCNNSAReassignPass(PassRegistry &);
 extern char &GCNNSAReassignID;
 
+ModulePass *createAMDGPUDeviceScopeSharedVariablePass();
+void initializeAMDGPUDeviceScopeSharedVariablePass(PassRegistry &);
+extern char &AMDGPUDeviceScopeSharedVariableID;
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -123,13 +123,15 @@
   // should only appear when IPO passes manages to move LDs defined in a kernel
   // into a single user function.
 
-  for (GlobalVariable &GV : M.globals()) {
-    // TODO: Region address
-    unsigned AS = GV.getAddressSpace();
-    if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
-      continue;
-
-    recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+  if (!AMDGPUTargetMachine::EnableDeviceScopeSharedVariable) {
+    for (GlobalVariable &GV : M.globals()) {
+      // TODO: Region address
+      unsigned AS = GV.getAddressSpace();
+      if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
+        continue;
+
+      recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+    }
   }
 
   if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp b/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp
new file
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp
@@ -0,0 +1,1548 @@
+//===-- AMDGPUDeviceScopeSharedVariable.cpp -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// [1]. What is this module pass about?
+//
+// This module pass implements an "indirect" method for supporting "shared"
+// variables which are defined within "device" functions. Programmatically
+// speaking, it is a "hacked" way of supporting device scoped shared variables.
+// It is a hacked way because direct support for device scoped shared
+// variables has become very complicated, mainly for the following reason:
+//
+// A shared variable is a "block" scoped variable, but its lifetime is the same
+// as that of the "work-group" to which it belongs. This necessitates keeping
+// track of multiple copies of shared variables related to different
+// work-groups from different running kernels, which is too costly and too
+// complex to implement given the "scarcity" of shared memory and the "strange
+// properties" of shared variables.
+//
+// [2]. What does this pass do?
+//
+// Though programmers define device scoped shared variables within device
+// functions, this module pass:
+//
+// A. Internally pushes the definitions of all those device scoped shared
+//    variables into the associated kernel(s). By associated kernels, we mean
+//    all those kernels from which there exist call graph paths to the device
+//    functions within which the device scoped shared variables are originally
+//    defined.
+// B. Implements the necessary program transformations to make sure that the
+//    new, big device scoped shared variable layout defined within the kernel
+//    is accessible within the device functions where those original device
+//    scoped shared variables were defined.
+//
+// [3]. How does the implementation of the pass look in brief?
+//
+// At a very high level, the implementation of this pass can be described as
+// below:
+//
+// A. For every kernel, traverse its call graph and collect all those device
+//    functions within which device scoped shared variables are defined, along
+//    with the corresponding device scoped shared variables.
+// B. Create a single big shared memory layout within the kernel by combining
+//    all those device scoped shared variables collected above.
+// C. Map each corresponding device scoped shared variable to its "offset"
+//    within the above defined big shared memory layout, per kernel.
+// D. Associate each kernel with a unique non-negative integer starting from 0,
+//    which we call the `kernel number`, and similarly associate each device
+//    scoped shared variable with a unique non-negative integer, again starting
+//    from 0, which we call the `lds number`.
+// E. Create a 2D offset table (within constant memory) where each row
+//    represents a different kernel and each column represents a different
+//    device scoped shared variable, and fill this table with the offsets
+//    computed in item C above.
+// F. Within each kernel, type-cast the corresponding single big shared memory
+//    layout to `char*`, and pass this type-casted pointer and the kernel
+//    number as new function arguments along the call graph path(s) so that
+//    they are accessible within those device functions where the original
+//    device scoped shared variables were defined.
+// G. Within device functions, replace all references to the original device
+//    scoped shared variables by their offset counterparts.
+// H. Finally, remove all the original device scoped shared variables.
+//
+// [4]. What are the positive consequences of this pass on the applications?
+//
+// Does one really exist?
+//
+// [5]. What are the negative consequences of this pass on the applications?
+//
+// A. Since we add new arguments to the functions along the call graph paths
+//    from kernels to device functions, there is a possibility of increased
+//    register pressure, which may affect performance.
+// B. Since we create single big shared memory layouts within kernels, we end
+//    up duplicating the shared memory of indirect shared variables within
+//    kernels; however, this is still less costly compared to direct support.
+// C. This implementation is indeed a very careful hack, and hence any bug in
+//    the implementation may have an adverse effect on the running application.
+//
+// [6]. What are some important current limitations of the pass?
+//
+// A. The implementation assumes that there are no recursive calls to device
+//    functions (either direct or indirect), and hence that the call graphs
+//    associated with kernels are acyclic. The presence of recursive calls may
+//    lead to undefined behaviour of the running application, or compilation
+//    may itself crash.
+// B. The handling of the presence of any indirect call(s) completely depends
+//    on the pass `called-value-propagation`, which inserts metadata at
+//    indirect call sites. This metadata indicates the set of functions the
+//    call site could possibly target at runtime.
+//    If, for some reason, this pass fails to attach the above metadata, it
+//    may lead to undefined behaviour of the running application, or
+//    compilation may itself crash.
+// C. All TODOs need to be revisited sooner rather than later.
+//
+// [7]. An Example.
+//
+// Before Pass:
+//
+// __device__ void foo()
+// {
+//   __shared__ char smc[10];
+//   __shared__ int smi[10];
+//   __shared__ float smf[10];
+//
+//   smc[1] = 1;
+//   smi[2] = 2;
+//   smf[3] = 3.0;
+// }
+//
+// __global__ void kernel()
+// {
+//   foo();
+// }
+//
+// After Pass:
+//
+// int offset_table[1][3] = {0, 10, 50};
+//
+// __device__ void foo(char *sm, int knum)
+// {
+//   int c_offset = offset_table[knum][0];
+//   int i_offset = offset_table[knum][1];
+//   int f_offset = offset_table[knum][2];
+//
+//   char *c_addr = (char*)(sm + c_offset);
+//   int *i_addr = (int*)(sm + i_offset);
+//   float *f_addr = (float*)(sm + f_offset);
+//
+//   c_addr[1] = 1;
+//   i_addr[2] = 2;
+//   f_addr[3] = 3.0;
+// }
+//
+// __global__ void kernel()
+// {
+//   __shared__ char sm[90]; // assuming char occupies 1 byte, int
+//                           // occupies 4 bytes, and float occupies 4
+//                           // bytes.
+//   foo(sm, 0);             // 0 is kernel number.
+// }
+//
+// NOTE: This pass is disabled by default, and enabled with the AMDGPU back-end
+// option `--amdgpu-enable-device-scope-shared-variable=true`.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <map>
+#include <set>
+
+#define DEBUG_TYPE "amdgpu-device-scope-shared-variable"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUDeviceScopeSharedVariable : public ModulePass {
+public:
+  static char ID;
+
+  AMDGPUDeviceScopeSharedVariable() : ModulePass(ID) {
+    initializeAMDGPUDeviceScopeSharedVariablePass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override;
+};
+
+} // namespace
+
+char AMDGPUDeviceScopeSharedVariable::ID = 0;
+
+char &llvm::AMDGPUDeviceScopeSharedVariableID =
+    AMDGPUDeviceScopeSharedVariable::ID;
+
+ModulePass *llvm::createAMDGPUDeviceScopeSharedVariablePass() {
+  return new AMDGPUDeviceScopeSharedVariable();
+}
+
+INITIALIZE_PASS(AMDGPUDeviceScopeSharedVariable,
+                "implement-amdgpu-device-scope-shared-variable",
+                "Implement AMDGPU Device Scope Shared Variable",
+                false /*only look at the cfg*/, false /*analysis pass*/)
+
+static unsigned getTypeStoreSizeInBits(Module &M, Type *Ty) {
+  return M.getDataLayout().getTypeSizeInBits(Ty).getFixedSize();
+}
+
+static unsigned getTypeStoreSizeInBytes(Module &M, Type *Ty) {
+  return getTypeStoreSizeInBits(M, Ty) / 8;
+}
+
+// TODO: This really looks like a horrible hack, but is there any better way
+// of handling `ConstantExprs`? No better idea comes to mind at the moment;
+// this needs to be revisited later.
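+// The helper below works around this: it recursively walks the users of a
+// `ConstantExpr` which refers to an LDS global until it reaches a real
+// `Instruction`, materializes the constant expression as an equivalent
+// instruction via `getAsInstruction()`, inserts that instruction just before
+// the user, and redirects the user's operands from the constant expression to
+// the newly materialized instruction.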
+static Instruction *replaceConstExprByInst(ConstantExpr *CE) { + for (auto *U : CE->users()) { + auto *I = dyn_cast(U); + + if (!I) + I = replaceConstExprByInst(dyn_cast(U)); + + if (I) { + auto *NI = CE->getAsInstruction(); + NI->insertBefore(I); + unsigned Ind = 0; + for (Use &UU : I->operands()) { + Value *V = UU.get(); + if (V == CE) + I->setOperand(Ind, NI); + ++Ind; + } + return NI; + } + } + + return nullptr; +} + +static void replaceInstWhichUsesLDS(Module &M, GlobalVariable *LDS, + Value *LDSAccessInst, Function *F, + Instruction *I, + SetVector &ToBeErasedInsts) { + // The function associated with the `I` shoud be the one where `LDS` is + // actually defined. +#ifndef NDEBUG + assert(I->getParent()->getParent() == F && + "The reference to LDS should only exists within the function where it " + "is actually defined\n"); +#endif + + // Create clone of `I`, say, it is `NewI`. Within `NewI`, replace the use(s) + // of `LDS` by `LDSAccessInst`. + Instruction *NewI = I->clone(); + unsigned Ind = 0; + for (Use &UU : NewI->operands()) { + Value *V = UU.get(); + if (V == LDS) + NewI->setOperand(Ind, LDSAccessInst); + ++Ind; + } + + // Insert `NewI` just before `I`, replace all uses of `I` by `NewI` and mark + // `I` as `to be erased`instruction. + NewI->insertBefore(I); + NewI->copyMetadata(*I); + I->replaceAllUsesWith(NewI); + ToBeErasedInsts.insert(I); +} + +static void updateAllUsersOfOriginalLDS(Module &M, GlobalVariable *LDS, + Function *F, + Instruction *LDSAccessInst) { + // Keep track of all the erased to be instructions. + SetVector ToBeErasedInsts; + + // Traverse through each `use` of `LDS`, create a new to be replaced value + // for each use case, and accordingly replace it with new one. + for (auto *U : LDS->users()) { + // `U` may be using `LDS`, but 'U` itself is not used anywhere, ignore `U`. + if (!U->getNumUses()) + continue; + + // Cast away const-ness from `U`. + User *UU = const_cast(U); + + if (auto *I = dyn_cast(UU)) { + replaceInstWhichUsesLDS(M, LDS, LDSAccessInst, F, I, ToBeErasedInsts); + } else if (auto *CE = dyn_cast(UU)) { + // TODO: Performance issues because of converting `ConstExpr` to + // `Instruction`? + auto *I = replaceConstExprByInst(CE); + replaceInstWhichUsesLDS(M, LDS, LDSAccessInst, F, I, ToBeErasedInsts); + CE->removeDeadConstantUsers(); + } else + llvm_unreachable("Not Implemented."); // TODO: What else is missing? + } + + // Erase all the instructions which are got replaced by new ones. + for (auto *I : ToBeErasedInsts) + I->eraseFromParent(); +} + +static Instruction *insertInstructionsToAccessNewLDSLayout( + Module &M, GlobalVariable *LDS, Function *F, + std::map &LDSToID, + GlobalVariable *LDSOffsetTable) { + // Suffix the names of the instructions with unique integer values. + static int Suffix = 0; + ++Suffix; + + // Get the first insert point of the entry basic block of current kernel. + auto BI = F->getEntryBlock().getFirstInsertionPt(); +#ifndef NDEBUG + assert(BI != F->getEntryBlock().end() && + "Entry basic block of the function cannot be empty, otherwise control " + "would not reach this point\n"); +#endif + auto &EI = *BI; + + // Get LDS offset table indicies. + auto *KIndex = F->getArg(F->arg_size() - 1); + auto *LIndex = Constant::getIntegerValue(Type::getInt64Ty(M.getContext()), + APInt(64, LDSToID[LDS])); + + // Insert GEP instruction which access the address + // `((LDSOffsetTable + KIndex) + LIndex)`, say, the result is `GEPInst` which + // is of type `Int64*`. 
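+  // In the lit test added with this patch this comes out as, e.g.:
+  //   %dssv.gep1.1 = getelementptr inbounds [2 x [1 x i64]],
+  //       [2 x [1 x i64]] addrspace(4)* @__LDSGlobalsOffsetTable__,
+  //       i32 0, i64 %1, i64 0
+  //   %dssv.load.1 = load i64, i64 addrspace(4)* %dssv.gep1.1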
+ Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(M.getContext())), + KIndex, LIndex}; + auto *GEPInst = GetElementPtrInst::CreateInBounds( + LDSOffsetTable->getValueType(), LDSOffsetTable, Indices, + Twine("dssv.gep1.") + Twine(Suffix), const_cast(&EI)); + + // Insert LOAD instruction which loads `offset` value from LDS offset table. + auto *LInst = new LoadInst(GEPInst->getType()->getPointerElementType(), + GEPInst, Twine("dssv.load.") + Twine(Suffix), + const_cast(&EI)); + + // Get the base pointer of new LDS layout. + auto *BasePtr = F->getArg(F->arg_size() - 2); + + // Insert GEP instruction which access the address `BasePtr + LInst`, say, the + // result is `GEPInst2` which if of type 'char*`. + Instruction *GEPInst2 = GetElementPtrInst::CreateInBounds( + BasePtr->getType()->getPointerElementType(), BasePtr, LInst, + Twine("dssv.gep2.") + Twine(Suffix), const_cast(&EI)); + + // Insert type-cast instruction just after above inserted GEP instruction + // which type-casts GEP instruction from `char*` to `ldstype*` where `ldstype` + // is the type of original LDS global. + Instruction *CastInst = new BitCastInst(GEPInst2, LDS->getType(), + Twine("dssv.cast.") + Twine(Suffix), + const_cast(&EI)); + + return CastInst; +} + +static void updateFunctionAssociatedWithLDS( + Module &M, GlobalVariable *LDS, + ValueMap &LDSToFunction, + std::map &LDSToID, + GlobalVariable *LDSOffsetTable) { + // Get end-callee associated with `LDS`. + auto *F = LDSToFunction[LDS]; + + // Insert necessary instruction(s) within `F` in order to access new LDS + // layout. + auto *LDSAccessInst = insertInstructionsToAccessNewLDSLayout( + M, LDS, F, LDSToID, LDSOffsetTable); + + // Finally update all the users of original LDS to refer to new LDS layout via + // above inserted LDS access instruction. + updateAllUsersOfOriginalLDS(M, LDS, F, LDSAccessInst); +} + +static void +eraseOldCallees(ValueMap &OldCalleeToNewCallee) { + // TODO: May be we can come-up with a more efficient implmentation to erase + // old callees from the module. It depends on how many callees that we land-up + // erasing in a real world hip application. May be not many, hence, as of now, + // we have just employed a simplest method of repeatedly visiting the old + // callees and removing each of them, once their number of uses become 0. + bool Loopover = true; + while (Loopover) { + Loopover = false; + for (auto OI = OldCalleeToNewCallee.begin(), + OE = OldCalleeToNewCallee.end(); + OI != OE; ++OI) { + auto *OldCallee = OI->first; + if (OldCallee->getNumUses()) + continue; + OldCalleeToNewCallee.erase(OI); + OldCallee->eraseFromParent(); + Loopover = true; + } + } +} + +static void +replaceNonCallInsts(ValueMap &OldCalleeToNewCallee, + SetVector &NonCallInsts) { + for (auto *I : NonCallInsts) { + Instruction *NewI = I->clone(); + Type *NewTy = nullptr; + + for (auto OI = OldCalleeToNewCallee.begin(), + OE = OldCalleeToNewCallee.end(); + OI != OE; ++OI) { + auto *OldCallee = OI->first; + auto *NewCallee = OI->second; + unsigned Ind = 0; + for (Use &UU : NewI->operands()) { + Value *V = UU.get(); + if (V == OldCallee) { + NewI->setOperand(Ind, NewCallee); + if (!NewTy) // TODO: Can we fix this horrible hack? 
+ NewTy = NewCallee->getType(); + } + ++Ind; + } + } + + NewI->insertBefore(I); + NewI->copyMetadata(*I); + I->replaceAllUsesWith(NewI); + I->eraseFromParent(); + NewI->mutateType(NewTy); + } +} + +static void +updateNonCallInsts(ValueMap &OldCalleeToNewCallee, + SetVector &NonCallInsts) { + replaceNonCallInsts(OldCalleeToNewCallee, NonCallInsts); +} + +static void +eraseOldCallSites(std::map &OldToNewCallSite) { + for (auto KI = OldToNewCallSite.begin(), KE = OldToNewCallSite.end(); + KI != KE; ++KI) { + auto *OldCI = KI->first; + auto *NewCI = KI->second; + OldCI->replaceAllUsesWith(NewCI); + OldCI->eraseFromParent(); + } +} + +static void getNewIndirectCallTargets( + Module &M, CallInst *CI, + ValueMap &OldCalleeToNewCallee, + std::vector &NewIndirectCallTargets) { +#ifndef NDEBUG + assert(CI->isIndirectCall() && "Indirect call expected\n"); +#endif + + auto *MD = CI->getMetadata(LLVMContext::MD_callees); +#ifndef NDEBUG + assert(MD && "Metadata about indirect call targets expected\n"); +#endif + + for (auto &Op : MD->operands()) { + auto *OldCallee = mdconst::extract_or_null(Op); + auto OI = OldCalleeToNewCallee.find(OldCallee); +#ifndef NDEBUG + assert(OI != OldCalleeToNewCallee.end() && "Old callee is expected\n"); +#endif + NewIndirectCallTargets.push_back(OI->second); + } +} + +static void copyDataFromOldToNewCallSite(CallInst *OldCI, CallInst *NewCI) { + // TODO: Why copyMetadata() not copying meta data. I see metadat associated + // with CI, but it is not copied to NewCI. CI->hasMetadata() is false, why? + NewCI->copyMetadata(*OldCI); + NewCI->setTailCall(OldCI->isTailCall()); + NewCI->setCallingConv(OldCI->getCallingConv()); +} + +static FunctionType *getClonedFunctionType(Module &M, Function *Callee) { + // Create a new function type by adding new parameters to the end of existing + // parameter list. 
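+  // The cloned signature becomes `(<original params>..., i8 addrspace(3)*,
+  // i64)`: the `i8 addrspace(3)*` parameter carries the base pointer of the
+  // kernel-specific LDS layout, and the trailing `i64` carries the kernel
+  // number used to index into the LDS offset table.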
+ auto *BasePtrTy = Type::getInt8PtrTy(M.getContext(), AMDGPUAS::LOCAL_ADDRESS); + auto *KernNumTy = Type::getInt64Ty(M.getContext()); + + SmallVector NewParams; + auto *FnTy = Callee->getFunctionType(); + for (auto PI = FnTy->param_begin(), PE = FnTy->param_end(); PI != PE; ++PI) + NewParams.push_back(*PI); + + NewParams.push_back(BasePtrTy); + NewParams.push_back(KernNumTy); + + auto *NewFnTy = + FunctionType::get(FnTy->getReturnType(), NewParams, FnTy->isVarArg()); + + return NewFnTy; +} + +static void +getNewArgumentList(Module &M, std::map &KernelToID, + std::map &KernelToBasePtrInst, + CallInst *CI, SmallVectorImpl &NewArgs) { + Value *BasePtrArg = nullptr; + Value *KernNumArg = nullptr; + + auto *Caller = CI->getParent()->getParent(); + if (Caller->getCallingConv() == CallingConv::AMDGPU_KERNEL) { + BasePtrArg = KernelToBasePtrInst[Caller]; + auto KernelNum = KernelToID[Caller]; + KernNumArg = Constant::getIntegerValue(Type::getInt64Ty(M.getContext()), + APInt(64, KernelNum)); + } else { + auto ArgSize = Caller->arg_size(); + BasePtrArg = Caller->getArg(ArgSize - 2); + KernNumArg = Caller->getArg(ArgSize - 1); + } + + for (auto AI = CI->arg_begin(), AE = CI->arg_end(); AI != AE; ++AI) + NewArgs.push_back(*AI); + NewArgs.push_back(BasePtrArg); + NewArgs.push_back(KernNumArg); +} + +static void replaceIndirectCallSites( + Module &M, std::map &KernelToID, + std::map &KernelToBasePtrInst, + ValueMap &OldCalleeToNewCallee, + ValueMap &IndirectCallSiteToCallee) { + MDBuilder MDB(M.getContext()); + std::map OldToNewCallSite; + for (auto SI = IndirectCallSiteToCallee.begin(), + SE = IndirectCallSiteToCallee.end(); + SI != SE; ++SI) { + auto *CI = SI->first; + auto *Callee = SI->second; + + // Get new argument list which can be used to insert new call instruction. + SmallVector NewArgs; + getNewArgumentList(M, KernelToID, KernelToBasePtrInst, CI, NewArgs); + + // Get required function type. + auto *NewFnTy = getClonedFunctionType(M, Callee); + + // Insert new call instruction `NewCI` just before the existing call + // instruction `CI`. + auto *NewCI = CallInst::Create(NewFnTy, CI->getCalledOperand(), NewArgs, + Twine(""), CI); + + copyDataFromOldToNewCallSite(CI, NewCI); + + // Create `!callee` metadata which targets newly cloned functions and set it + // to `NewCI`. + std::vector NewIndirectCallTargets; + getNewIndirectCallTargets(M, CI, OldCalleeToNewCallee, + NewIndirectCallTargets); + auto *NewCallees = MDB.createCallees(NewIndirectCallTargets); + NewCI->setMetadata(LLVMContext::MD_callees, NewCallees); + + OldToNewCallSite[CI] = NewCI; + } + + eraseOldCallSites(OldToNewCallSite); +} + +static bool isCallerNewClonedFunction( + Function *F, ValueMap &NewCalleeToOldCallee) { + if (NewCalleeToOldCallee.find(F) != NewCalleeToOldCallee.end()) + return true; + + return false; +} + +static bool +isApplicableInst(Instruction *I, + ValueMap &NewCalleeToOldCallee) { + // We are only interested in the inctructions within kernel or within new + // cloned functions. 
+#ifndef NDEBUG + assert(I && "Valid instruction expected\n"); +#endif + auto *Caller = I->getParent()->getParent(); + if (Caller->getCallingConv() == CallingConv::AMDGPU_KERNEL || + isCallerNewClonedFunction(Caller, NewCalleeToOldCallee)) + return true; + + return false; +} + +static void insertToIndirectCallSiteSet( + CallInst *CI, ValueMap &OldCalleeToNewCallee, + ValueMap &NewCalleeToOldCallee, + ValueMap &IndirectCallSiteToCallee) { + auto *MD = CI->getMetadata(LLVMContext::MD_callees); + if (!MD) + return; + + // TODO: Further clean-up the logic? + for (auto &Op : MD->operands()) { + auto *Callee = mdconst::extract_or_null(Op); + if (Callee && !Callee->isDeclaration() && + OldCalleeToNewCallee.find(Callee) != OldCalleeToNewCallee.end() && + isApplicableInst(CI, NewCalleeToOldCallee) && + IndirectCallSiteToCallee.find(CI) == IndirectCallSiteToCallee.end()) { + IndirectCallSiteToCallee[CI] = Callee; + return; + } + } +} + +static void collectIndirectCallSites( + Module &M, ValueMap &OldCalleeToNewCallee, + ValueMap &NewCalleeToOldCallee, + SetVector &NonCallInsts, + ValueMap &IndirectCallSiteToCallee) { + SmallVector ValueStack; + SetVector Visited; + + for (auto *I : NonCallInsts) + ValueStack.push_back(I); + + while (!ValueStack.empty()) { + auto *V = ValueStack.pop_back_val(); + + if (!Visited.insert(V)) + continue; + + auto *CI = dyn_cast(V); + if (CI && CI->isIndirectCall()) { + insertToIndirectCallSiteSet(CI, OldCalleeToNewCallee, + NewCalleeToOldCallee, + IndirectCallSiteToCallee); + continue; + } + + for (auto *U : V->users()) + ValueStack.push_back(U); + } +} + +static void updateIndirectCallSites( + Module &M, std::map &KernelToID, + std::map &KernelToBasePtrInst, + ValueMap &OldCalleeToNewCallee, + ValueMap &NewCalleeToOldCallee, + SetVector &NonCallInsts) { + ValueMap IndirectCallSiteToCallee; + collectIndirectCallSites(M, OldCalleeToNewCallee, NewCalleeToOldCallee, + NonCallInsts, IndirectCallSiteToCallee); + replaceIndirectCallSites(M, KernelToID, KernelToBasePtrInst, + OldCalleeToNewCallee, IndirectCallSiteToCallee); +} + +static void +replaceDirectCallSites(Module &M, std::map &KernelToID, + std::map &KernelToBasePtrInst, + Function *NewCallee, + SetVector &DirectCallSites) { + std::map OldToNewCallSite; + for (auto *CI : DirectCallSites) { + // Get new argument list which can be used to insert new call instruction. + SmallVector NewArgs; + getNewArgumentList(M, KernelToID, KernelToBasePtrInst, CI, NewArgs); + + // Insert new call instruction `NewCI` just before the existing call + // instruction `CI`. + auto *NewCI = CallInst::Create(NewCallee->getFunctionType(), NewCallee, + NewArgs, Twine(""), CI); + + copyDataFromOldToNewCallSite(CI, NewCI); + + OldToNewCallSite[CI] = NewCI; + } + + eraseOldCallSites(OldToNewCallSite); +} + +static void +collectDirectCallSites(Function *OldCallee, + ValueMap &NewCalleeToOldCallee, + SetVector &DirectCallSites) { + for (auto *U : OldCallee->users()) { + auto *CI = dyn_cast(U); + if (CI && isApplicableInst(CI, NewCalleeToOldCallee)) + DirectCallSites.insert(CI); + } +} + +static void +updateDirectCallSites(Module &M, std::map &KernelToID, + std::map &KernelToBasePtrInst, + ValueMap &OldCalleeToNewCallee, + ValueMap &NewCalleeToOldCallee) { + // Update all direct call sites of all callees. 
+ for (auto OI = OldCalleeToNewCallee.begin(), OE = OldCalleeToNewCallee.end(); + OI != OE; ++OI) { + auto *OldCallee = OI->first; + auto *NewCallee = OI->second; + SetVector DirectCallSites; + collectDirectCallSites(OldCallee, NewCalleeToOldCallee, DirectCallSites); + replaceDirectCallSites(M, KernelToID, KernelToBasePtrInst, NewCallee, + DirectCallSites); + } +} + +static void +collectNonCallInsts(Function *OldCallee, + ValueMap &NewCalleeToOldCallee, + SetVector &NonCallInsts) { + for (auto *U : OldCallee->users()) { + auto *CI = dyn_cast(U); + if (!CI && isApplicableInst(dyn_cast(U), NewCalleeToOldCallee)) + NonCallInsts.insert(dyn_cast(U)); + } +} + +static void +updateCalleeUsers(Module &M, std::map &KernelToID, + std::map &KernelToBasePtrInst, + ValueMap &OldCalleeToNewCallee) { + // Create mapping from new callees to old callees. + ValueMap NewCalleeToOldCallee; + for (auto OI = OldCalleeToNewCallee.begin(), OE = OldCalleeToNewCallee.end(); + OI != OE; ++OI) + NewCalleeToOldCallee[OI->second] = OI->first; + + // Collect all non-call users of all callees, usually associated with indirect + // calls. + SetVector NonCallInsts; + for (auto OI = OldCalleeToNewCallee.begin(), OE = OldCalleeToNewCallee.end(); + OI != OE; ++OI) + collectNonCallInsts(OI->first, NewCalleeToOldCallee, NonCallInsts); + + // Update all direct call sites of all callees. + updateDirectCallSites(M, KernelToID, KernelToBasePtrInst, + OldCalleeToNewCallee, NewCalleeToOldCallee); + + // Update all indirect call sites of all callees. + updateIndirectCallSites(M, KernelToID, KernelToBasePtrInst, + OldCalleeToNewCallee, NewCalleeToOldCallee, + NonCallInsts); + + // Update all non-call user instructions which refer to old callees like + // address taken instructions for indirect calls. + updateNonCallInsts(OldCalleeToNewCallee, NonCallInsts); +} + +static void updateKernelToCalleeList( + ValueMap> &KernelToCallee, + ValueMap &OldCalleeToNewCallee) { + for (auto KI = KernelToCallee.begin(), KE = KernelToCallee.end(); KI != KE; + ++KI) { + auto *K = KI->first; + auto OldCalleeList = KI->second; + std::set NewCalleeList; + for (auto *OldCallee : OldCalleeList) { + if (OldCalleeToNewCallee.find(OldCallee) != OldCalleeToNewCallee.end()) + NewCalleeList.insert(OldCalleeToNewCallee[OldCallee]); + else + NewCalleeList.insert(OldCallee); + } + KernelToCallee[K] = NewCalleeList; + } +} + +static void +updateLDSToFunctionMap(ValueMap &LDSToFunction, + ValueMap &OldCalleeToNewCallee) { + for (auto LI = LDSToFunction.begin(), LE = LDSToFunction.end(); LI != LE; + ++LI) { + auto *LDS = LI->first; + auto *OldF = LI->second; + if (OldCalleeToNewCallee.find(OldF) != OldCalleeToNewCallee.end()) + LDSToFunction[LDS] = OldCalleeToNewCallee[OldF]; + } +} + +static Function *cloneCallee(Module &M, Function *Callee) { + // Create a new function type of to be cloned function. + auto *NewFnTy = getClonedFunctionType(M, Callee); + + // Create a copy of the `Callee`, but with new function type. + auto *NewCallee = + Function::Create(NewFnTy, Callee->getLinkage(), Callee->getAddressSpace(), + Callee->getName() + Twine(".c")); + + ValueToValueMapTy VMap; + auto *NewCalleeArgIt = NewCallee->arg_begin(); + for (auto &Arg : Callee->args()) { + auto ArgName = Arg.getName(); + NewCalleeArgIt->setName(ArgName); + VMap[&Arg] = &(*NewCalleeArgIt++); + } + + // TODO: ModuleLevelChanges should be set to true or false? 
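+  // For now `false` is used: the clone is created within the same module as
+  // the original callee, so no module-level remapping should be needed here.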
+ SmallVector Returns; + CloneFunctionInto(NewCallee, Callee, VMap, /*ModuleLevelChanges=*/false, + Returns); + + // Copy all metadata + SmallVector, 1> MDs; + Callee->getAllMetadata(MDs); + for (auto MDIt : MDs) + NewCallee->addMetadata(MDIt.first, *MDIt.second); + + // Insert `NewCallee` just before `Callee` within the module. + M.getFunctionList().insert(Callee->getIterator(), NewCallee); + + // Swap names so that new function retains original name. + auto NewName = NewCallee->getName(); + NewCallee->takeName(Callee); + Callee->setName(NewName); + + return NewCallee; +} + +static void traverseCGPathsAndCloneCallees( + Module &M, ValueMap &OldCalleeToNewCallee, + SmallVectorImpl> &CGPaths) { + // 1. Traverse all the call graph paths from kernels to callees. + // 2. For each device function encoutered while traversing, create a clone of + // it, by adding parameters to it's parameter list, but also retain the + // original device function for the moment. + for (auto CGPath : CGPaths) { + // TODO: We can in-fact assert that length of `CGPath` is atleast two. But, + // we are okay for now. + if (CGPath.size() < 2) + continue; + + // We are interested in cloning of only device functions in the call graph + // path, hence we start from second node in the call graph path. + auto PI = CGPath.begin() + 1; + auto PE = CGPath.end(); + for (; PI != PE; ++PI) { + auto *CurCallee = *PI; +#ifndef NDEBUG + assert(CurCallee && "Valid function expected\n"); +#endif + + if (OldCalleeToNewCallee.find(CurCallee) != OldCalleeToNewCallee.end()) + continue; + + // The `CurCallee` is encountered first time, create a clone of it. + auto *NewCallee = cloneCallee(M, CurCallee); + OldCalleeToNewCallee[CurCallee] = NewCallee; + } + } +} + +static void +pushCallGraphPaths(CallGraphNode *CGNode, SetVector &TopCGPath, + SmallVectorImpl> &CGPathStack, + std::map &CGNodeToCallInst) { + for (auto NI = CGNode->begin(), NE = CGNode->end(); NI != NE; ++NI) { + auto *CI = dyn_cast(NI->first.getValue()); + auto *CGN = NI->second; +#ifndef NDEBUG + assert(CI && "Call instruction associated with call graph node cannot be" + " null\n"); + assert(CGN && "Call graph node associated with function definition cannot" + " be null\n"); +#endif + SetVector ClonedCGPath(TopCGPath.begin(), TopCGPath.end()); + ClonedCGPath.insert(CGN); + CGPathStack.push_back(ClonedCGPath); + CGNodeToCallInst[CGN] = CI; + } +} + +static void +insertToCGPathSet(CallGraphNode *CGNode, Function *EndCallee, + SetVector &TopCGPath, + SmallVectorImpl> &CGPathStack, + std::map &CGNodeToCallInst, + SmallVectorImpl> &CGPaths, + bool IndirectCallHandling = false) { + auto *Callee = CGNode->getFunction(); + if (Callee && !Callee->isDeclaration()) { + if (Callee == EndCallee) { + SetVector FPath; + for (auto *CGN : TopCGPath) + FPath.insert(CGN->getFunction()); + CGPaths.push_back(FPath); + return; + } + if (IndirectCallHandling) + TopCGPath.insert(CGNode); + pushCallGraphPaths(CGNode, TopCGPath, CGPathStack, CGNodeToCallInst); + } +} + +static void collectCallGraphPathsBetweenKernelAndCallee( + Module &M, Function *K, Function *Callee, + SmallVectorImpl> &CGPaths) { + // Traverse the call graph associated with the kernel in DFS manner and + // collect all the paths from kernel to callee. + // + // TODO: Note that this algorithm will not work if there exist recursive + // calls, and the current assumption here is that the call graph is acyclic. + // We need to visit it back again to handle call graph which could contain + // cycles. 
+ auto CG = CallGraph(M); + auto *KernCGNode = CG[K]; +#ifndef NDEBUG + assert(KernCGNode && "Call graph node associated with kernel definition " + "cannot be null\n"); +#endif + + SetVector CGPath; + CGPath.insert(KernCGNode); + SmallVector, 8> CGPathStack; + std::map CGNodeToCallInst; + + pushCallGraphPaths(KernCGNode, CGPath, CGPathStack, CGNodeToCallInst); + + while (!CGPathStack.empty()) { + auto TopCGPath = CGPathStack.pop_back_val(); + auto *CGNode = TopCGPath.back(); + + if (CGNode->getFunction()) { + // Direct calls + insertToCGPathSet(CGNode, Callee, TopCGPath, CGPathStack, + CGNodeToCallInst, CGPaths); + } else { + // Indirect calls + TopCGPath.remove(CGNode); + auto *CI = CGNodeToCallInst[CGNode]; + if (auto *MD = CI->getMetadata(LLVMContext::MD_callees)) { + auto CloneOfTopCGPath = TopCGPath; + for (auto &Op : MD->operands()) + insertToCGPathSet(CG[mdconst::extract_or_null(Op)], Callee, + CloneOfTopCGPath, CGPathStack, CGNodeToCallInst, + CGPaths, true); + } + } + } +} + +static void createCloneOfCalleesWithNewParams( + Module &M, ValueMap> &KernelToCallee, + ValueMap &OldCalleeToNewCallee) { + // Construct clones of all callees which accept two additional new parameters: + // One is LDS layout base pointer which is of type `char*`, and other one is + // kernel number which is of type `Int64`. + // + // Collect all call graph paths between kernels and callees. + SmallVector, 8> CGPaths; + for (auto KI = KernelToCallee.begin(), KE = KernelToCallee.end(); KI != KE; + ++KI) + for (auto *Callee : KI->second) + collectCallGraphPathsBetweenKernelAndCallee(M, KI->first, Callee, + CGPaths); + + // Traverse all call graph paths from kernels to callees, and create clone of + // all callees along the paths. + traverseCGPathsAndCloneCallees(M, OldCalleeToNewCallee, CGPaths); +} + +static void +processCallees(Module &M, ValueMap &LDSToFunction, + ValueMap> &KernelToCallee, + std::map &KernelToID, + std::map &KernelToBasePtrInst) { + // Create clones of callees which accept new parameters. + ValueMap OldCalleeToNewCallee; + createCloneOfCalleesWithNewParams(M, KernelToCallee, OldCalleeToNewCallee); + + // Update all the required data structures to point to new cloned functions in + // place of their old counterparts. + updateLDSToFunctionMap(LDSToFunction, OldCalleeToNewCallee); + updateKernelToCalleeList(KernelToCallee, OldCalleeToNewCallee); + + // Appropriately update all users of all old callees to refer to corresponding + // new callees. + updateCalleeUsers(M, KernelToID, KernelToBasePtrInst, OldCalleeToNewCallee); + + // By now, all old functions are dead without any reference being made to + // them, erase them now from the module. + eraseOldCallees(OldCalleeToNewCallee); + +#ifndef NDEBUG + assert(OldCalleeToNewCallee.empty() && + "All the old callees should have dead by now\n"); +#endif +} + +static Instruction * +insertBasePointerAccessInstructionWithinKernel(Module &M, Function *K, + GlobalVariable *NewLDS) { + // Suffix the names of the instructions with unique integer values. + static int Suffix = 0; + ++Suffix; + + // Get the first insert point of the entry basic block of current kernel. + auto BI = K->getEntryBlock().getFirstInsertionPt(); +#ifndef NDEBUG + assert(BI != K->getEntryBlock().end() && + "Entry basic block of the kernel cannot be empty, otherwise control " + "would not reach this point\n"); +#endif + auto &EI = *BI; + + // Insert GEP instruction which access the address `NewLDS + 0`, say, the + // result is `GEPInst` which is of type `char*`. 
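+  // In the lit test added with this patch this comes out as, e.g.:
+  //   %dssv.gep.1 = getelementptr inbounds [1024 x i8],
+  //       [1024 x i8] addrspace(3)* @<kernel>.Unified.Device.Scope.LDS.Layout,
+  //       i32 0, i32 0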
+ Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(M.getContext())), + Constant::getIntegerValue( + Type::getInt64Ty(M.getContext()), APInt(32, 0))}; + + Instruction *GEPInst = GetElementPtrInst::CreateInBounds( + NewLDS->getValueType(), const_cast(NewLDS), Indices, + Twine("dssv.gep.") + Twine(Suffix), const_cast(&EI)); + + return GEPInst; +} + +static GlobalVariable *constructLDSOffsetTable( + Module &M, SetVector &Kernels, + SetVector &LDSGlobals, + std::map> + &KernelToLDSOffset, + std::map &KernelToID, + std::map &LDSToID, + std::map &IDToKernel, + std::map &IDToLDS) { + // Get type of LDS offset table. + auto *EleTy = Type::getInt64Ty(M.getContext()); + auto *Arr1DTy = ArrayType::get(EleTy, LDSGlobals.size()); + auto *Arr2DTy = ArrayType::get(Arr1DTy, Kernels.size()); + + // Create offset initialization list. + SmallVector Init2DValues; + for (unsigned K = 0; K < Kernels.size(); ++K) { + auto *Kernel = IDToKernel[K]; +#ifndef NDEBUG + assert(KernelToLDSOffset.find(Kernel) != KernelToLDSOffset.end() && + "Expected LDS offset list\n"); +#endif + auto &LDSToOffset = KernelToLDSOffset[Kernel]; + SmallVector Init1DValues; + for (unsigned L = 0; L < LDSGlobals.size(); ++L) { + auto *LDS = IDToLDS[L]; + auto Offset = + LDSToOffset.find(LDS) != LDSToOffset.end() ? LDSToOffset[LDS] : -1; + auto *C = Constant::getIntegerValue(EleTy, APInt(64, Offset)); + Init1DValues.push_back(C); + } + auto *Const1D = ConstantArray::get(Arr1DTy, Init1DValues); + Init2DValues.push_back(Const1D); + } + auto *Const2D = ConstantArray::get(Arr2DTy, Init2DValues); + + // Create LDS offset table with offset initialization. + auto *LDSOffsetTable = new GlobalVariable( + M, Arr2DTy, false, GlobalValue::InternalLinkage, Const2D, + Twine("__LDSGlobalsOffsetTable__"), nullptr, + GlobalVariable::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS); + + // Set proper alignment. + LDSOffsetTable->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + LDSOffsetTable->setAlignment( + MaybeAlign(M.getDataLayout().getPreferredAlign(LDSOffsetTable))); + + return LDSOffsetTable; +} + +static GlobalVariable * +createSingleContiguousLayout(Module &M, Function *K, + uint64_t &TotalLDSSizeInBytes) { + // Create a new LDS global which is nothing but a single contigeous shared + // memory layout representing all the LDS globals which are defined within + // callees, and the size of this new contigeous LDS global layout is equal to + // the sum of the sizes of all these associated LDS globals. + auto *NewLDSTy = + ArrayType::get(IntegerType::get(M.getContext(), 8), TotalLDSSizeInBytes); + auto *NewLDS = new GlobalVariable( + M, NewLDSTy, false, GlobalValue::InternalLinkage, + UndefValue::get(NewLDSTy), + Twine(K->getName()) + Twine(".Unified.Device.Scope.LDS.Layout"), nullptr, + GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); + NewLDS->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + NewLDS->setAlignment(MaybeAlign(M.getDataLayout().getPreferredAlign(NewLDS))); + + return NewLDS; +} + +static void +computeTotalLDSSizeInBytes(ValueMap &LDSToSize, + SetVector &KernelLDSList, + std::map &LDSToOffset, + uint64_t &TotalLDSSizeInBytes) { + // For the current kernel, compute the total size of all LDS globals, and also + // offsets associated with them within in the new LDS global. 
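+  // Offsets are assigned by packing the variables back to back in the order
+  // they appear in `KernelLDSList`; no padding is inserted beyond their store
+  // sizes, so the total layout size is simply the sum of the individual sizes.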
+ TotalLDSSizeInBytes = 0; + for (auto *LDS : KernelLDSList) { + LDSToOffset[LDS] = TotalLDSSizeInBytes; + TotalLDSSizeInBytes += LDSToSize[LDS]; + } +} + +static void constructKernelSpecificLDSLayouts( + Module &M, SetVector &Kernels, + SetVector &LDSGlobals, + ValueMap> &KernelToLDS, + ValueMap &LDSToSize, + std::map &KernelToID, + std::map> + &KernelToLDSOffset, + ValueMap &KernelToNewLDSLayout) { + // Traverse through each kernel, and construct corresponding new LDS layout. + for (auto *K : Kernels) { + // Get LDS list for current kernel. + SetVector &KernelLDSList = KernelToLDS[K]; +#ifndef NDEBUG + assert(!KernelLDSList.empty() && "Expected non-empty LDS set\n"); +#endif + + // Compute total aggregate size of all LDS globals associated with current + // kernel along with their offsets within new LDS layout. + uint64_t TotalLDSSizeInBytes; + std::map LDSToOffset; + computeTotalLDSSizeInBytes(LDSToSize, KernelLDSList, LDSToOffset, + TotalLDSSizeInBytes); + + // Create a single contigeous LDS layout for current kernel. + auto *NewLDS = createSingleContiguousLayout(M, K, TotalLDSSizeInBytes); + + // Save offsets and new LDS layout. + KernelToLDSOffset[K] = LDSToOffset; + KernelToNewLDSLayout[K] = NewLDS; + } +} + +static void MapKernelsAndLDSGlobalsToUniqueId( + SetVector &Kernels, SetVector &LDSGlobals, + std::map &KernelToID, + std::map &LDSToID, + std::map &IDToKernel, + std::map &IDToLDS) { + // Map each kernel to unique integer and vice versa. + uint64_t KNum = 0; + for (auto *K : Kernels) { + KernelToID[K] = KNum; + IDToKernel[KNum] = K; + ++KNum; + } + + // Map each LDS to unique integer and vice versa. + uint64_t LNum = 0; + for (auto *LDSGlobal : LDSGlobals) { + LDSToID[LDSGlobal] = LNum; + IDToLDS[LNum] = LDSGlobal; + ++LNum; + } +} + +static void processDeviceScopeSharedVariables( + Module &M, SetVector &Kernels, + SetVector &LDSGlobals, + ValueMap &LDSToFunction, + ValueMap> &KernelToCallee, + ValueMap> &KernelToLDS, + ValueMap &LDSToSize) { + // Map each kernel and LDS global to unique integer and vice versa. + std::map KernelToID; + std::map LDSToID; + std::map IDToKernel; + std::map IDToLDS; + MapKernelsAndLDSGlobalsToUniqueId(Kernels, LDSGlobals, KernelToID, LDSToID, + IDToKernel, IDToLDS); + + // Create a single contigeous LDS global layout for each kernel. + std::map> KernelToLDSOffset; + ValueMap KernelToNewLDSLayout; + constructKernelSpecificLDSLayouts(M, Kernels, LDSGlobals, KernelToLDS, + LDSToSize, KernelToID, KernelToLDSOffset, + KernelToNewLDSLayout); + + // Create 2D LDS offset table which will be referred at runtime to access the + // kernel specific LDS layout offset value which correspond to original device + // scope LDS. + auto *LDSOffsetTable = + constructLDSOffsetTable(M, Kernels, LDSGlobals, KernelToLDSOffset, + KernelToID, LDSToID, IDToKernel, IDToLDS); + + // Insert GEP instruction - `LDS_LAYOUT_START_ADDR + 0` within each kernel + // which will be of type `char*`. + std::map KernelToBasePtrInst; + for (auto *K : Kernels) + KernelToBasePtrInst[K] = insertBasePointerAccessInstructionWithinKernel( + M, K, KernelToNewLDSLayout[K]); + + // Create clones of callees to accept new parameters and accordingly update + // corresponding call sites. + processCallees(M, LDSToFunction, KernelToCallee, KernelToID, + KernelToBasePtrInst); + + // Update end-callees where the LDS globals was originally defined so that all + // the references to original LDS globals within end-callees are appropriately + // replaced. 
And finally erase original LDS globals from the module. + for (auto *LDS : LDSGlobals) { + updateFunctionAssociatedWithLDS(M, LDS, LDSToFunction, LDSToID, + LDSOffsetTable); + LDS->eraseFromParent(); + } +} + +static void +getLDSGlobalSizeInBytes(Module &M, GlobalVariable *LDSGlobal, + ValueMap &LDSToSize) { + LDSToSize[LDSGlobal] = getTypeStoreSizeInBytes(M, LDSGlobal->getValueType()); +} + +static void +filterKernels(SetVector &Kernels, + ValueMap> &KernelToLDS) { + // Collect all the kernels which can be removed. + SetVector ToBeRemovedKernels; + for (auto *K : Kernels) + if (KernelToLDS.find(K) == KernelToLDS.end()) + ToBeRemovedKernels.insert(K); + + // Remove all those kernels which do not have any device scope variables + // associated with them. + for (auto *K : ToBeRemovedKernels) + Kernels.remove(K); +} + +static void pairUpKernelWithLDSList( + Function *K, ValueMap> &KernelToCallee, + ValueMap> &FunctionToLDS, + ValueMap> &KernelToLDS) { + // Collect all the LDS globals defined within the end-callees associated with + // the current kernel. + SetVector LDSSet; + auto Callees = KernelToCallee[K]; + for (auto *Callee : Callees) { + if (FunctionToLDS.find(Callee) == FunctionToLDS.end()) + continue; + SetVector CalleeLDSSet = FunctionToLDS[Callee]; + for (auto *CalleeLDS : CalleeLDSSet) + LDSSet.insert(CalleeLDS); + } + if (!LDSSet.empty()) + KernelToLDS[K] = LDSSet; +} + +static void filterDeviceScopeLDSGlobals( + SetVector &LDSGlobals, + ValueMap &LDSToFunction) { + // Filter out all LDS globals which are defined within kernels since we don`t + // need to handle them. + SetVector ToBeRemovedLDSList; + for (auto *LDS : LDSGlobals) + if (LDSToFunction.find(LDS) == LDSToFunction.end()) + ToBeRemovedLDSList.insert(LDS); + for (auto *LDS : ToBeRemovedLDSList) + LDSGlobals.remove(LDS); +} + +static void filterDeviceFunctions( + SetVector &LDSGlobals, + ValueMap &LDSToFunction, + ValueMap> &FunctionToLDS, + ValueMap> &KernelToCallee) { + // There might exist device functions with LDS globals defined within them, + // but without a call graph path from any of the kernels. Filter out such + // device functions and associated LDS globals. + // + // Collect all actives end-callees. + std::set ActiveEndCallees; + for (auto KI = KernelToCallee.begin(), KE = KernelToCallee.end(); KI != KE; + ++KI) { + auto &CalleeSet = KI->second; + for (auto *Callee : CalleeSet) + ActiveEndCallees.insert(Callee); + } + + // Filter `FunctionToLDS` data structure. + std::set ToBeRemovedFunctions; + for (auto FI = FunctionToLDS.begin(), FE = FunctionToLDS.end(); FI != FE; + ++FI) + if (ActiveEndCallees.find(FI->first) == ActiveEndCallees.end()) + ToBeRemovedFunctions.insert(FI->first); + for (auto *F : ToBeRemovedFunctions) + FunctionToLDS.erase(F); + + // Filter `LDSToFunction` data structure. + std::set ToBeRemovedLDSGlobals; + for (auto LI = LDSToFunction.begin(), LE = LDSToFunction.end(); LI != LE; + ++LI) + if (FunctionToLDS.find(LI->second) == FunctionToLDS.end()) + ToBeRemovedLDSGlobals.insert(LI->first); + for (auto *LDS : ToBeRemovedLDSGlobals) + LDSToFunction.erase(LDS); + + // Filter `LDSGlobals` data structure. 
+ filterDeviceScopeLDSGlobals(LDSGlobals, LDSToFunction); +} + +static void +pushCallGraphNodes(CallGraphNode *CGNode, + SmallVectorImpl &CGNodeStack, + std::map &CGNodeToCallInst) { +#ifndef NDEBUG + assert(CGNode && "Call graph node associated with function definition cannot" + " be null\n"); +#endif + + for (auto GI = CGNode->begin(), GE = CGNode->end(); GI != GE; ++GI) { + auto *CI = dyn_cast(GI->first.getValue()); + auto *CGN = GI->second; +#ifndef NDEBUG + assert(CI && "Call instruction associated with call graph node cannot be" + " null\n"); + assert(CGN && "Call graph node associated with function definition cannot" + " be null\n"); +#endif + CGNodeStack.push_back(CGN); + CGNodeToCallInst[CGN] = CI; + } +} + +static void insertToCalleeSet( + CallGraphNode *CGNode, + ValueMap> &FunctionToLDS, + SmallVectorImpl &CGNodeStack, + std::map &CGNodeToCallInst, + std::set &CalleeSet) { + auto *Callee = CGNode->getFunction(); + if (Callee && !Callee->isDeclaration()) { + if (FunctionToLDS.find(Callee) != FunctionToLDS.end()) + CalleeSet.insert(Callee); + pushCallGraphNodes(CGNode, CGNodeStack, CGNodeToCallInst); + } +} + +static void pairUpKernelWithCalleeList( + Module &M, Function *K, + ValueMap> &FunctionToLDS, + ValueMap> &KernelToCallee) { + // Get the call graph node associated with current kernel, traverse the call + // graph associated with it in DFS manner and collect all the associated + // callees which define LDS global(s). + auto CG = CallGraph(M); + auto *KernCGNode = CG[K]; + SmallVector CGNodeStack; + SetVector Visited; + std::map CGNodeToCallInst; + std::set CalleeSet; + + pushCallGraphNodes(KernCGNode, CGNodeStack, CGNodeToCallInst); + + while (!CGNodeStack.empty()) { + auto *CGNode = CGNodeStack.pop_back_val(); + if (!Visited.insert(CGNode)) + continue; + + if (CGNode->getFunction()) { + // Direct calls + insertToCalleeSet(CGNode, FunctionToLDS, CGNodeStack, CGNodeToCallInst, + CalleeSet); + } else { + // Indirect calls + auto *CI = CGNodeToCallInst[CGNode]; + if (auto *MD = CI->getMetadata(LLVMContext::MD_callees)) { + for (auto &Op : MD->operands()) + insertToCalleeSet(CG[mdconst::extract_or_null(Op)], + FunctionToLDS, CGNodeStack, CGNodeToCallInst, + CalleeSet); + } + + // Indirect call does not bind to any particular function, and the + // `CallGraphNode` is same for all the function pointers which have same + // signature. + Visited.remove(CGNode); + } + } + + KernelToCallee[K] = CalleeSet; +} + +static void createFunctionToLDSMap( + ValueMap &LDSToFunction, + ValueMap> &FunctionToLDS) { + for (auto LI = LDSToFunction.begin(), LE = LDSToFunction.end(); LI != LE; + ++LI) { + auto *LDSGlobal = LI->first; + auto *F = LI->second; + auto FI = FunctionToLDS.find(F); + if (FI == FunctionToLDS.end()) { + SetVector LDSSet; + LDSSet.insert(LDSGlobal); + FunctionToLDS[F] = LDSSet; + } else + FunctionToLDS[F].insert(LDSGlobal); + } +} + +static void pairUpLDSGlobalWithItsAssociatedFunction( + GlobalVariable *LDSGlobal, + ValueMap &LDSToFunction) { + // Recursively visit user list of current LDS global and find the function + // within which the `LDSGlobal` is defined, and this function should always be + // successfully found. 
+#ifndef NDEBUG + assert(!LDSGlobal->user_empty() && + "LDS Global user list cannot be empty since it must have been defined " + "within either kernel or device function"); +#endif + SmallVector UserStack; + SetVector Visited; + + for (auto *U : LDSGlobal->users()) + UserStack.push_back(U); + + while (!UserStack.empty()) { + auto *U = UserStack.pop_back_val(); + if (!Visited.insert(U)) + continue; + + if (auto *I = dyn_cast(U)) { + auto *F = I->getParent()->getParent(); + if (!F) + continue; + + // We are only interested in device scope shared variables. + if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL) + LDSToFunction[LDSGlobal] = F; + + return; + } + + for (auto *UU : U->users()) + UserStack.push_back(UU); + } + + llvm_unreachable("Control is not expected to reach this point"); +} + +static bool +handleDeviceScopeSharedVariables(Module &M, + SetVector &LDSGlobals, + SetVector &Kernels) { + // Pair up each LDS global with the function within which the LDS global is + // defined. + ValueMap LDSToFunction; + for (auto *LDSGlobal : LDSGlobals) + pairUpLDSGlobalWithItsAssociatedFunction(LDSGlobal, LDSToFunction); + + // Filter out all LDS globals which are defined within kernels since we don`t + // need to handle them. + filterDeviceScopeLDSGlobals(LDSGlobals, LDSToFunction); + + // Create reverse map from function to LDS globals. + ValueMap> FunctionToLDS; + createFunctionToLDSMap(LDSToFunction, FunctionToLDS); + + // Pair up kernels with end-callees which define LDS globals and there exist + // call graph paths from kernels to end-callees. + ValueMap> KernelToCallee; + for (auto *K : Kernels) + pairUpKernelWithCalleeList(M, K, FunctionToLDS, KernelToCallee); + + // There might exist device functions with LDS globals defined within them, + // but without a call graph path from any of the kernels. Filter out such + // device functions and associated LDS globals. + filterDeviceFunctions(LDSGlobals, LDSToFunction, FunctionToLDS, + KernelToCallee); + + // Pair up kernels with device scope LDS globals. + ValueMap> KernelToLDS; + for (auto *K : Kernels) + pairUpKernelWithLDSList(K, KernelToCallee, FunctionToLDS, KernelToLDS); + + // Consider only kernels which have device scope shared variables associated + // with them. + filterKernels(Kernels, KernelToLDS); + + // If we do not have any kernel which is associated with device scope shared + // variables, then there is nothing to do, and there is no any module level + // changes to be done, just return `false`. + if (Kernels.empty()) + return false; + + // Get the size of each LDS global in bytes. + ValueMap LDSToSize; + for (auto *LDSGlobal : LDSGlobals) + getLDSGlobalSizeInBytes(M, LDSGlobal, LDSToSize); + + // Perform all the necessary processing required. + processDeviceScopeSharedVariables(M, Kernels, LDSGlobals, LDSToFunction, + KernelToCallee, KernelToLDS, LDSToSize); + + // Module level changes are done, return `true`. + return true; +} + +static bool handleDeviceScopeSharedVariables(Module &M) { + // Collect all the (static) LDS globals defined within the current module. + SetVector LDSGlobals; + for (auto &GV : M.globals()) + if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && + !GV.hasExternalLinkage()) + LDSGlobals.insert(&GV); + + if (LDSGlobals.empty()) { + LLVM_DEBUG(dbgs() << "No LDS globals defined in the module " << M.getName() + << ", skipping handling device of scope shared variables" + << "\n"); + return false; + } + + // Collect all the amdgpu kernels defined within the current module. 
+ SetVector Kernels; + for (auto &F : M.functions()) + if ((F.getCallingConv() == CallingConv::AMDGPU_KERNEL) && + !F.isDeclaration()) + Kernels.insert(&F); + + if (Kernels.empty()) { + LLVM_DEBUG(dbgs() << "No kernels defined in the module " << M.getName() + << ", skipping handling of device scope shared variables" + << "\n"); + return false; + } + + return handleDeviceScopeSharedVariables(M, LDSGlobals, Kernels); +} + +bool AMDGPUDeviceScopeSharedVariable::runOnModule(Module &M) { + LLVM_DEBUG(dbgs() << "===== Handling device scope shared variables in the " + "module " + << M.getName() << "\n"); + + // TODO: We only want to handle HIP kernels, and no kernels from from other + // programming languages, like OpenCL, OpenMP, etc. Do we need to add a + // condition here for it, and skip running the pass for non-HIP kernels? + if (skipModule(M)) { + LLVM_DEBUG(dbgs() << "Skipping handling of device scope shared variables " + "in the module " + << M.getName() << "\n"); + return false; + } + + bool Changed = handleDeviceScopeSharedVariables(M); + + LLVM_DEBUG(dbgs() << "===== Done with handling device scope shared variables " + "in the module " + << M.getName() << "\n"); + + return Changed; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -40,6 +40,7 @@ static bool EnableLateStructurizeCFG; static bool EnableFunctionCalls; static bool EnableFixedFunctionABI; + static bool EnableDeviceScopeSharedVariable; AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -198,6 +198,12 @@ cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), cl::Hidden); +static cl::opt EnableDeviceScopeSharedVariable( + "amdgpu-enable-device-scope-shared-variable", + cl::desc("Support amdgpu device scope shared variables"), + cl::location(AMDGPUTargetMachine::EnableDeviceScopeSharedVariable), + cl::init(false), cl::Hidden); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine X(getTheAMDGPUTarget()); @@ -265,6 +271,7 @@ initializeGCNRegBankReassignPass(*PR); initializeGCNNSAReassignPass(*PR); initializeSIAddIMGInitPass(*PR); + initializeAMDGPUDeviceScopeSharedVariablePass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -394,6 +401,7 @@ bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; bool AMDGPUTargetMachine::EnableFunctionCalls = false; bool AMDGPUTargetMachine::EnableFixedFunctionABI = false; +bool AMDGPUTargetMachine::EnableDeviceScopeSharedVariable = false; AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; @@ -718,6 +726,12 @@ disablePass(&FuncletLayoutID); disablePass(&PatchableFunctionID); + // We expect to run this pass as a first AMDGPU IR pass so that new + // instructions being added in this pass can possibly undergo further + // transformations via subsequent passes. 
+ if (EnableDeviceScopeSharedVariable) + addPass(createAMDGPUDeviceScopeSharedVariablePass()); + addPass(createAMDGPUPrintfRuntimeBinding()); // This must occur before inlining, as the inliner will not look through diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -50,6 +50,7 @@ AMDGPUAtomicOptimizer.cpp AMDGPUCallLowering.cpp AMDGPUCodeGenPrepare.cpp + AMDGPUDeviceScopeSharedVariable.cpp AMDGPUExportClustering.cpp AMDGPUFixFunctionBitcasts.cpp AMDGPUFrameLowering.cpp diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-deep-function-calls.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-deep-function-calls.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-deep-function-calls.ll @@ -0,0 +1,166 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck -check-prefixes=NEW-LDS,OLD-LDS,OFFSET-TABLE,NEW-PARAM,GCN %s + +; OLD-LDS-NOT: @_ZZ22function_four_with_ldsPiS_E17smem_in_func_four +; OLD-LDS: @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two +; OLD-LDS: @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one +; NEW-LDS: @_Z19kernel_two_with_ldsPiS_.Unified.Device.Scope.LDS.Layout +; NEW-LDS: @_Z19kernel_one_with_ldsPiS_.Unified.Device.Scope.LDS.Layout +; OLD-LDS-NOT: @_ZZ22function_four_with_ldsPiS_E17smem_in_func_four +@_ZZ22function_four_with_ldsPiS_E17smem_in_func_four = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 +@_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 +@_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 + +; OFFSET-TABLE: @__LDSGlobalsOffsetTable__ + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z22function_four_with_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep1.1 = getelementptr inbounds [2 x [1 x i64]], [2 x [1 x i64]] addrspace(4)* @__LDSGlobalsOffsetTable__, i32 0, i64 %1, i64 0 +; GCN-NEXT: %dssv.load.1 = load i64, i64 addrspace(4)* %dssv.gep1.1, align 4 +; GCN-NEXT: %dssv.gep2.1 = getelementptr inbounds i8, i8 addrspace(3)* %0, i64 %dssv.load.1 +; GCN-NEXT: %dssv.cast.1 = bitcast i8 addrspace(3)* %dssv.gep2.1 to [256 x i32] addrspace(3)* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32* %arrayidx, align 4 +; GCN-NEXT: %4 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.1, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %4, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %5 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.1, i32 0, i32 %add +; GCN-NEXT: %6 = load i32, i32 addrspace(3)* %5, align 4 +; GCN-NEXT: %mul = mul nsw i32 %6, %3 +; GCN-NEXT: %arrayidx12 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32* %arrayidx12, align 4 +; GCN-NEXT: ret void +entry: + %0 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx31 = getelementptr inbounds [256 x 
i32], [256 x i32] addrspace(3)* @_ZZ22function_four_with_ldsPiS_E17smem_in_func_four, i32 0, i32 %0 + store i32 %1, i32 addrspace(3)* %arrayidx31, align 4 + %add = add nuw nsw i32 %0, 1 + %arrayidx93 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ22function_four_with_ldsPiS_E17smem_in_func_four, i32 0, i32 %add + %2 = load i32, i32 addrspace(3)* %arrayidx93, align 4 + %mul = mul nsw i32 %2, %1 + %arrayidx12 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom + store i32 %mul, i32* %arrayidx12, align 4 + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z26function_three_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z22function_four_with_ldsPiS_(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z22function_four_with_ldsPiS_(i32* %i_arg, i32* %o_arg) + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z24function_two_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg) + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z24function_one_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg) + ret void +} + +define protected amdgpu_kernel void @_Z19kernel_two_with_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.{{[1-2]}} = getelementptr inbounds [1024 x i8], [1024 x i8] addrspace(3)* @_Z19kernel_two_with_ldsPiS_.Unified.Device.Scope.LDS.Layout, i32 0, i32 0 +; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4 +; GCN-NEXT: %arrayidx57 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %arrayidx57, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %arrayidx119 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two, i32 0, i32 %add +; GCN-NEXT: %4 = load i32, i32 addrspace(3)* %arrayidx119, align 4 +; GCN-NEXT: %mul = mul nsw i32 %4, %3 +; GCN-NEXT: %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4 +; GCN-NEXT: tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.{{[1-2]}}, i64 {{[0-1]}}) +; 
GCN-NEXT: tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.{{[1-2]}}, i64 {{[0-1]}}) +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + %2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %2 to i64 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom + %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %arrayidx57 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two, i32 0, i32 %2 + store i32 %3, i32 addrspace(3)* %arrayidx57, align 4 + %add = add nuw nsw i32 %2, 1 + %arrayidx119 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two, i32 0, i32 %add + %4 = load i32, i32 addrspace(3)* %arrayidx119, align 4 + %mul = mul nsw i32 %4, %3 + %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom + store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4 + tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1) + tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1) + ret void +} + +define protected amdgpu_kernel void @_Z19kernel_one_with_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.{{[1-2]}} = getelementptr inbounds [1024 x i8], [1024 x i8] addrspace(3)* @_Z19kernel_one_with_ldsPiS_.Unified.Device.Scope.LDS.Layout, i32 0, i32 0 +; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4 +; GCN-NEXT: %arrayidx57 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %arrayidx57, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %arrayidx119 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one, i32 0, i32 %add +; GCN-NEXT: %4 = load i32, i32 addrspace(3)* %arrayidx119, align 4 +; GCN-NEXT: %mul = mul nsw i32 %4, %3 +; GCN-NEXT: %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4 +; GCN-NEXT: tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.{{[1-2]}}, i64 {{[0-1]}}) +; GCN-NEXT: tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.{{[1-2]}}, i64 {{[0-1]}}) +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + %2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %2 to i64 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom + %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %arrayidx57 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one, i32 0, i32 
%2 + store i32 %3, i32 addrspace(3)* %arrayidx57, align 4 + %add = add nuw nsw i32 %2, 1 + %arrayidx119 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one, i32 0, i32 %add + %4 = load i32, i32 addrspace(3)* %arrayidx119, align 4 + %mul = mul nsw i32 %4, %3 + %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom + store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4 + tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1) + tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() + diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-indirect-call.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-indirect-call.ll @@ -0,0 +1,154 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck -check-prefixes=NEW-LDS,OLD-LDS,OFFSET-TABLE,NEW-PARAM,GCN,METADATA %s + +; OLD-LDS-NOT: @_ZZ21function_six_with_ldsPiS_E16smem_in_func_six +; NEW-LDS: @_Z19kernel_two_with_ldsPiS_.Unified.Device.Scope.LDS.Layout +; NEW-LDS: @_Z19kernel_one_with_ldsPiS_.Unified.Device.Scope.LDS.Layout +; OLD-LDS-NOT: @_ZZ21function_six_with_ldsPiS_E16smem_in_func_six +@_ZZ21function_six_with_ldsPiS_E16smem_in_func_six = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 + +; OFFSET-TABLE: @__LDSGlobalsOffsetTable__ + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z21function_six_with_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep1.1 = getelementptr inbounds [2 x [1 x i64]], [2 x [1 x i64]] addrspace(4)* @__LDSGlobalsOffsetTable__, i32 0, i64 %1, i64 0 +; GCN-NEXT: %dssv.load.1 = load i64, i64 addrspace(4)* %dssv.gep1.1, align 4 +; GCN-NEXT: %dssv.gep2.1 = getelementptr inbounds i8, i8 addrspace(3)* %0, i64 %dssv.load.1 +; GCN-NEXT: %dssv.cast.1 = bitcast i8 addrspace(3)* %dssv.gep2.1 to [256 x i32] addrspace(3)* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32* %arrayidx, align 4 +; GCN-NEXT: %4 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.1, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %4, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %5 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.1, i32 0, i32 %add +; GCN-NEXT: %6 = load i32, i32 addrspace(3)* %5, align 4 +; GCN-NEXT: %mul = mul nsw i32 %6, %3 +; GCN-NEXT: %arrayidx12 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32* %arrayidx12, align 4 +; GCN-NEXT: ret void +entry: + %0 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx31 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ21function_six_with_ldsPiS_E16smem_in_func_six, i32 0, i32 %0 + store i32 %1, i32 addrspace(3)* %arrayidx31, align 4 + %add = add nuw nsw i32 %0, 1 + %arrayidx93 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* 
@_ZZ21function_six_with_ldsPiS_E16smem_in_func_six, i32 0, i32 %add + %2 = load i32, i32 addrspace(3)* %arrayidx93, align 4 + %mul = mul nsw i32 %2, %1 + %arrayidx12 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom + store i32 %mul, i32* %arrayidx12, align 4 + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal void @_Z25function_five_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z21function_six_with_ldsPiS_(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z21function_six_with_ldsPiS_(i32* %i_arg, i32* %o_arg) + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal void @_Z25function_four_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z21function_six_with_ldsPiS_(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z21function_six_with_ldsPiS_(i32* %i_arg, i32* %o_arg) + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: %2 = load i32, i32* %i_arg, align 4 +; GCN-NEXT: %cmp = icmp eq i32 %2, 0 +; GCN-NEXT: %3 = select i1 %cmp, void (i32*, i32*, i8 addrspace(3)*, i64)* @_Z25function_four_with_no_ldsPiS_, void (i32*, i32*, i8 addrspace(3)*, i64)* @_Z25function_five_with_no_ldsPiS_ +; GCN-NEXT: tail call void %3(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1), !callees !0 +; GCN-NEXT: ret void +entry: + %0 = load i32, i32* %i_arg, align 4 + %cmp = icmp eq i32 %0, 0 + %func_three_fptr.0 = select i1 %cmp, void (i32*, i32*)* @_Z25function_four_with_no_ldsPiS_, void (i32*, i32*)* @_Z25function_five_with_no_ldsPiS_ + tail call void %func_three_fptr.0(i32* nonnull %i_arg, i32* %o_arg), !callees !10 + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal void @_Z24function_two_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg) + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal void @_Z24function_one_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg) + ret void +} + +define protected amdgpu_kernel void @_Z19kernel_two_with_ldsPiS_(i32 addrspace(1)* %i_arg.coerce, i32 addrspace(1)* %o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.{{[1-2]}} = getelementptr inbounds [1024 x i8], [1024 x i8] addrspace(3)* @_Z19kernel_two_with_ldsPiS_.Unified.Device.Scope.LDS.Layout, i32 0, i32 0 +; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = load i32, i32 addrspace(1)* %i_arg.coerce, align 4 +; GCN-NEXT: %cmp = icmp eq i32 %1, 0 +; GCN-NEXT: %2 = select i1 %cmp, void (i32*, i32*, i8 addrspace(3)*, i64)* @_Z24function_one_with_no_ldsPiS_, 
void (i32*, i32*, i8 addrspace(3)*, i64)* @_Z24function_two_with_no_ldsPiS_ +; GCN-NEXT: %3 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: tail call void %2(i32* %0, i32* %3, i8 addrspace(3)* %dssv.gep.{{[1-2]}}, i64 {{[0-1]}}), !callees !1 +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = load i32, i32 addrspace(1)* %i_arg.coerce, align 4 + %cmp = icmp eq i32 %1, 0 + %kern_two_fptr.0 = select i1 %cmp, void (i32*, i32*)* @_Z24function_one_with_no_ldsPiS_, void (i32*, i32*)* @_Z24function_two_with_no_ldsPiS_ + %2 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + tail call void %kern_two_fptr.0(i32* nonnull %0, i32* %2), !callees !11 + ret void +} + +define protected amdgpu_kernel void @_Z19kernel_one_with_ldsPiS_(i32 addrspace(1)* %i_arg.coerce, i32 addrspace(1)* %o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.{{[1-2]}} = getelementptr inbounds [1024 x i8], [1024 x i8] addrspace(3)* @_Z19kernel_one_with_ldsPiS_.Unified.Device.Scope.LDS.Layout, i32 0, i32 0 +; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = load i32, i32 addrspace(1)* %i_arg.coerce, align 4 +; GCN-NEXT: %cmp = icmp eq i32 %1, 0 +; GCN-NEXT: %2 = select i1 %cmp, void (i32*, i32*, i8 addrspace(3)*, i64)* @_Z24function_one_with_no_ldsPiS_, void (i32*, i32*, i8 addrspace(3)*, i64)* @_Z24function_two_with_no_ldsPiS_ +; GCN-NEXT: %3 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: tail call void %2(i32* %0, i32* %3, i8 addrspace(3)* %dssv.gep.{{[1-2]}}, i64 {{[0-1]}}), !callees !1 +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = load i32, i32 addrspace(1)* %i_arg.coerce, align 4 + %cmp = icmp eq i32 %1, 0 + %kern_one_fptr.0 = select i1 %cmp, void (i32*, i32*)* @_Z24function_one_with_no_ldsPiS_, void (i32*, i32*)* @_Z24function_two_with_no_ldsPiS_ + %2 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + tail call void %kern_one_fptr.0(i32* nonnull %0, i32* %2), !callees !11 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() + +;METADATA: !0 = !{void (i32*, i32*, i8 addrspace(3)*, i64)* @_Z25function_five_with_no_ldsPiS_, void (i32*, i32*, i8 addrspace(3)*, i64)* @_Z25function_four_with_no_ldsPiS_} +;METADATA: !1 = !{void (i32*, i32*, i8 addrspace(3)*, i64)* @_Z24function_one_with_no_ldsPiS_, void (i32*, i32*, i8 addrspace(3)*, i64)* @_Z24function_two_with_no_ldsPiS_} +!10 = !{void (i32*, i32*)* @_Z25function_five_with_no_ldsPiS_, void (i32*, i32*)* @_Z25function_four_with_no_ldsPiS_} +!11 = !{void (i32*, i32*)* @_Z24function_one_with_no_ldsPiS_, void (i32*, i32*)* @_Z24function_two_with_no_ldsPiS_} diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-2d-array.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-2d-array.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-2d-array.ll @@ -0,0 +1,61 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck -check-prefixes=NEW-LDS,OLD-LDS,OFFSET-TABLE,NEW-PARAM,GCN %s + +; OLD-LDS-NOT: @_ZZ17function_with_ldsPiS_E4smem +; NEW-LDS: @_Z18kernel_with_no_ldsPiS_.Unified.Device.Scope.LDS.Layout +; OLD-LDS-NOT: @_ZZ17function_with_ldsPiS_E4smem +@_ZZ17function_with_ldsPiS_E4smem = internal unnamed_addr addrspace(3) global [256 x [4 x i32]] undef, align 16 + +; OFFSET-TABLE: @__LDSGlobalsOffsetTable__ + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: 
i64 %1 +define internal fastcc void @_Z17function_with_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep1.1 = getelementptr inbounds [1 x [1 x i64]], [1 x [1 x i64]] addrspace(4)* @__LDSGlobalsOffsetTable__, i32 0, i64 %1, i64 0 +; GCN-NEXT: %dssv.load.1 = load i64, i64 addrspace(4)* %dssv.gep1.1, align 4 +; GCN-NEXT: %dssv.gep2.1 = getelementptr inbounds i8, i8 addrspace(3)* %0, i64 %dssv.load.1 +; GCN-NEXT: %dssv.cast.1 = bitcast i8 addrspace(3)* %dssv.gep2.1 to [256 x [4 x i32]] addrspace(3)* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32* %arrayidx, align 4 +; GCN-NEXT: %4 = getelementptr inbounds [256 x [4 x i32]], [256 x [4 x i32]] addrspace(3)* %dssv.cast.1, i32 0, i32 %2, i32 2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %4, align 8 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %5 = getelementptr inbounds [256 x [4 x i32]], [256 x [4 x i32]] addrspace(3)* %dssv.cast.1, i32 0, i32 %add, i32 2 +; GCN-NEXT: %6 = load i32, i32 addrspace(3)* %5, align 8 +; GCN-NEXT: %mul = mul nsw i32 %6, %3 +; GCN-NEXT: %arrayidx15 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32* %arrayidx15, align 4 +; GCN-NEXT: ret void +entry: + %0 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx42 = getelementptr inbounds [256 x [4 x i32]], [256 x [4 x i32]] addrspace(3)* @_ZZ17function_with_ldsPiS_E4smem, i32 0, i32 %0, i32 2 + store i32 %1, i32 addrspace(3)* %arrayidx42, align 8 + %add = add nuw nsw i32 %0, 1 + %arrayidx126 = getelementptr inbounds [256 x [4 x i32]], [256 x [4 x i32]] addrspace(3)* @_ZZ17function_with_ldsPiS_E4smem, i32 0, i32 %add, i32 2 + %2 = load i32, i32 addrspace(3)* %arrayidx126, align 8 + %mul = mul nsw i32 %2, %1 + %arrayidx15 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom + store i32 %mul, i32* %arrayidx15, align 4 + ret void +} + +define protected amdgpu_kernel void @_Z18kernel_with_no_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.1 = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(3)* @_Z18kernel_with_no_ldsPiS_.Unified.Device.Scope.LDS.Layout, i32 0, i32 0 +; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: tail call fastcc void @_Z17function_with_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.1, i64 0) +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + tail call fastcc void @_Z17function_with_ldsPiS_(i32* %0, i32* %1) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-with-different-data-types.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-with-different-data-types.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-with-different-data-types.ll @@ -0,0 +1,119 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck 
-check-prefixes=NEW-LDS,OLD-LDS,OFFSET-TABLE,NEW-PARAM,GCN %s + +; OLD-LDS-NOT: @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6c_smem +; OLD-LDS-NOT: @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6i_smem +; OLD-LDS-NOT: @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6f_smem +; NEW-LDS: @_Z18kernel_with_no_ldsPcS_PiS0_PfS1_.Unified.Device.Scope.LDS.Layout +; OLD-LDS-NOT: @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6c_smem +; OLD-LDS-NOT: @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6i_smem +; OLD-LDS-NOT: @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6f_smem +@_ZZ17function_with_ldsPcS_PiS0_PfS1_E6c_smem = internal unnamed_addr addrspace(3) global [256 x i8] undef, align 16 +@_ZZ17function_with_ldsPcS_PiS0_PfS1_E6i_smem = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 +@_ZZ17function_with_ldsPcS_PiS0_PfS1_E6f_smem = internal unnamed_addr addrspace(3) global [256 x float] undef, align 16 + +; OFFSET-TABLE: @__LDSGlobalsOffsetTable__ + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z17function_with_ldsPcS_PiS0_PfS1_(i8* nocapture readonly %ci_arg, i8* nocapture %co_arg, i32* nocapture readonly %ii_arg, i32* nocapture %io_arg, float* nocapture readonly %fi_arg, float* nocapture %fo_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep1.3 = getelementptr inbounds [1 x [3 x i64]], [1 x [3 x i64]] addrspace(4)* @__LDSGlobalsOffsetTable__, i32 0, i64 %1, i64 {{[0-2]}} +; GCN-NEXT: %dssv.load.3 = load i64, i64 addrspace(4)* %dssv.gep1.3, align 4 +; GCN-NEXT: %dssv.gep2.3 = getelementptr inbounds i8, i8 addrspace(3)* %0, i64 %dssv.load.3 +; GCN-NEXT: %dssv.cast.3 = bitcast i8 addrspace(3)* %dssv.gep2.3 to [256 x {{[a-z]+[0-9]*}}] addrspace(3)* +; GCN-NEXT: %dssv.gep1.2 = getelementptr inbounds [1 x [3 x i64]], [1 x [3 x i64]] addrspace(4)* @__LDSGlobalsOffsetTable__, i32 0, i64 %1, i64 {{[0-2]}} +; GCN-NEXT: %dssv.load.2 = load i64, i64 addrspace(4)* %dssv.gep1.2, align 4 +; GCN-NEXT: %dssv.gep2.2 = getelementptr inbounds i8, i8 addrspace(3)* %0, i64 %dssv.load.2 +; GCN-NEXT: %dssv.cast.2 = bitcast i8 addrspace(3)* %dssv.gep2.2 to [256 x i32] addrspace(3)* +; GCN-NEXT: %dssv.gep1.1 = getelementptr inbounds [1 x [3 x i64]], [1 x [3 x {{[a-z]+[0-9]*}}]] addrspace(4)* @__LDSGlobalsOffsetTable__, i32 0, i64 %1, i64 {{[0-2]}} +; GCN-NEXT: %dssv.load.1 = load i64, i64 addrspace(4)* %dssv.gep1.1, align 4 +; GCN-NEXT: %dssv.gep2.1 = getelementptr inbounds i8, i8 addrspace(3)* %0, i64 %dssv.load.1 +; GCN-NEXT: %dssv.cast.1 = bitcast i8 addrspace(3)* %dssv.gep2.1 to [256 x {{[a-z]+[0-9]*}}] addrspace(3)* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx = getelementptr inbounds i8, i8* %ci_arg, i64 %idxprom +; GCN-NEXT: %3 = load i8, i8* %arrayidx, align 1 +; GCN-NEXT: %4 = getelementptr inbounds [256 x i8], [256 x i8] addrspace(3)* %dssv.cast.{{[1-3]}}, i32 0, i32 %2 +; GCN-NEXT: store i8 %3, i8 addrspace(3)* %4, align 1 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %5 = getelementptr inbounds [256 x i8], [256 x i8] addrspace(3)* %dssv.cast.{{[1-3]}}, i32 0, i32 %add +; GCN-NEXT: %6 = load i8, i8 addrspace(3)* %5, align 1 +; GCN-NEXT: %mul = mul i8 %6, %3 +; GCN-NEXT: %arrayidx14 = getelementptr inbounds i8, i8* %co_arg, i64 %idxprom +; GCN-NEXT: store i8 %mul, i8* %arrayidx14, align 1 +; GCN-NEXT: %arrayidx17 = getelementptr inbounds i32, i32* %ii_arg, i64 %idxprom +; GCN-NEXT: %7 = load i32, i32* %arrayidx17, align 4 +; GCN-NEXT: %8 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* 
%dssv.cast.{{[1-3]}}, i32 0, i32 %2 +; GCN-NEXT: store i32 %7, i32 addrspace(3)* %8, align 4 +; GCN-NEXT: %9 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.{{[1-3]}}, i32 0, i32 %add +; GCN-NEXT: %10 = load i32, i32 addrspace(3)* %9, align 4 +; GCN-NEXT: %mul28 = mul nsw i32 %10, %7 +; GCN-NEXT: %arrayidx31 = getelementptr inbounds i32, i32* %io_arg, i64 %idxprom +; GCN-NEXT: store i32 %mul28, i32* %arrayidx31, align 4 +; GCN-NEXT: %arrayidx34 = getelementptr inbounds float, float* %fi_arg, i64 %idxprom +; GCN-NEXT: %11 = load float, float* %arrayidx34, align 4 +; GCN-NEXT: %12 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* %dssv.cast.{{[1-3]}}, i32 0, i32 %2 +; GCN-NEXT: store float %11, float addrspace(3)* %12, align 4 +; GCN-NEXT: %13 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* %dssv.cast.{{[1-3]}}, i32 0, i32 %add +; GCN-NEXT: %14 = load float, float addrspace(3)* %13, align 4 +; GCN-NEXT: %mul45 = fmul contract float %11, %14 +; GCN-NEXT: %arrayidx48 = getelementptr inbounds float, float* %fo_arg, i64 %idxprom +; GCN-NEXT: store float %mul45, float* %arrayidx48, align 4 +; GCN-NEXT: ret void +entry: + %0 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %0 to i64 + %arrayidx = getelementptr inbounds i8, i8* %ci_arg, i64 %idxprom + %1 = load i8, i8* %arrayidx, align 1 + %arrayidx32 = getelementptr inbounds [256 x i8], [256 x i8] addrspace(3)* @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6c_smem, i32 0, i32 %0 + store i8 %1, i8 addrspace(3)* %arrayidx32, align 1 + %add = add nuw nsw i32 %0, 1 + %arrayidx94 = getelementptr inbounds [256 x i8], [256 x i8] addrspace(3)* @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6c_smem, i32 0, i32 %add + %2 = load i8, i8 addrspace(3)* %arrayidx94, align 1 + %mul = mul i8 %2, %1 + %arrayidx14 = getelementptr inbounds i8, i8* %co_arg, i64 %idxprom + store i8 %mul, i8* %arrayidx14, align 1 + %arrayidx17 = getelementptr inbounds i32, i32* %ii_arg, i64 %idxprom + %3 = load i32, i32* %arrayidx17, align 4 + %arrayidx205 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6i_smem, i32 0, i32 %0 + store i32 %3, i32 addrspace(3)* %arrayidx205, align 4 + %arrayidx277 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6i_smem, i32 0, i32 %add + %4 = load i32, i32 addrspace(3)* %arrayidx277, align 4 + %mul28 = mul nsw i32 %4, %3 + %arrayidx31 = getelementptr inbounds i32, i32* %io_arg, i64 %idxprom + store i32 %mul28, i32* %arrayidx31, align 4 + %arrayidx34 = getelementptr inbounds float, float* %fi_arg, i64 %idxprom + %5 = load float, float* %arrayidx34, align 4 + %arrayidx378 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6f_smem, i32 0, i32 %0 + store float %5, float addrspace(3)* %arrayidx378, align 4 + %arrayidx4410 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6f_smem, i32 0, i32 %add + %6 = load float, float addrspace(3)* %arrayidx4410, align 4 + %mul45 = fmul contract float %5, %6 + %arrayidx48 = getelementptr inbounds float, float* %fo_arg, i64 %idxprom + store float %mul45, float* %arrayidx48, align 4 + ret void +} + +define protected amdgpu_kernel void @_Z18kernel_with_no_ldsPcS_PiS0_PfS1_(i8 addrspace(1)* nocapture readonly %ci_arg.coerce, i8 addrspace(1)* nocapture %co_arg.coerce, i32 addrspace(1)* nocapture readonly %ii_arg.coerce, i32 addrspace(1)* nocapture 
%io_arg.coerce, float addrspace(1)* nocapture readonly %fi_arg.coerce, float addrspace(1)* nocapture %fo_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.1 = getelementptr inbounds [2304 x i8], [2304 x i8] addrspace(3)* @_Z18kernel_with_no_ldsPcS_PiS0_PfS1_.Unified.Device.Scope.LDS.Layout, i32 0, i32 0 +; GCN-NEXT: %0 = addrspacecast i8 addrspace(1)* %ci_arg.coerce to i8* +; GCN-NEXT: %1 = addrspacecast i8 addrspace(1)* %co_arg.coerce to i8* +; GCN-NEXT: %2 = addrspacecast i32 addrspace(1)* %ii_arg.coerce to i32* +; GCN-NEXT: %3 = addrspacecast i32 addrspace(1)* %io_arg.coerce to i32* +; GCN-NEXT: %4 = addrspacecast float addrspace(1)* %fi_arg.coerce to float* +; GCN-NEXT: %5 = addrspacecast float addrspace(1)* %fo_arg.coerce to float* +; GCN-NEXT: tail call fastcc void @_Z17function_with_ldsPcS_PiS0_PfS1_(i8* %0, i8* %1, i32* %2, i32* %3, float* %4, float* %5, i8 addrspace(3)* %dssv.gep.1, i64 0) +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i8 addrspace(1)* %ci_arg.coerce to i8* + %1 = addrspacecast i8 addrspace(1)* %co_arg.coerce to i8* + %2 = addrspacecast i32 addrspace(1)* %ii_arg.coerce to i32* + %3 = addrspacecast i32 addrspace(1)* %io_arg.coerce to i32* + %4 = addrspacecast float addrspace(1)* %fi_arg.coerce to float* + %5 = addrspacecast float addrspace(1)* %fo_arg.coerce to float* + tail call fastcc void @_Z17function_with_ldsPcS_PiS0_PfS1_(i8* %0, i8* %1, i32* %2, i32* %3, float* %4, float* %5) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-function.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-function.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-function.ll @@ -0,0 +1,61 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck -check-prefixes=NEW-LDS,OLD-LDS,OFFSET-TABLE,NEW-PARAM,GCN %s + +; OLD-LDS-NOT: @_ZZ17function_with_ldsPiS_E4smem +; NEW-LDS: @_Z18kernel_with_no_ldsPiS_.Unified.Device.Scope.LDS.Layout +; OLD-LDS-NOT: @_ZZ17function_with_ldsPiS_E4smem +@_ZZ17function_with_ldsPiS_E4smem = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 + +; OFFSET-TABLE: @__LDSGlobalsOffsetTable__ + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z17function_with_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep1.1 = getelementptr inbounds [1 x [1 x i64]], [1 x [1 x i64]] addrspace(4)* @__LDSGlobalsOffsetTable__, i32 0, i64 %1, i64 0 +; GCN-NEXT: %dssv.load.1 = load i64, i64 addrspace(4)* %dssv.gep1.1, align 4 +; GCN-NEXT: %dssv.gep2.1 = getelementptr inbounds i8, i8 addrspace(3)* %0, i64 %dssv.load.1 +; GCN-NEXT: %dssv.cast.1 = bitcast i8 addrspace(3)* %dssv.gep2.1 to [256 x i32] addrspace(3)* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32* %arrayidx, align 4 +; GCN-NEXT: %4 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.1, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %4, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %5 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.1, i32 0, i32 %add +; GCN-NEXT: %6 = load i32, i32 addrspace(3)* %5, align 4 +; GCN-NEXT: %mul = mul 
nsw i32 %6, %3 +; GCN-NEXT: %arrayidx12 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32* %arrayidx12, align 4 +; GCN-NEXT: ret void +entry: + %0 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx31 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ17function_with_ldsPiS_E4smem, i32 0, i32 %0 + store i32 %1, i32 addrspace(3)* %arrayidx31, align 4 + %add = add nuw nsw i32 %0, 1 + %arrayidx93 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ17function_with_ldsPiS_E4smem, i32 0, i32 %add + %2 = load i32, i32 addrspace(3)* %arrayidx93, align 4 + %mul = mul nsw i32 %2, %1 + %arrayidx12 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom + store i32 %mul, i32* %arrayidx12, align 4 + ret void +} + +define protected amdgpu_kernel void @_Z18kernel_with_no_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.1 = getelementptr inbounds [1024 x i8], [1024 x i8] addrspace(3)* @_Z18kernel_with_no_ldsPiS_.Unified.Device.Scope.LDS.Layout, i32 0, i32 0 +; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: tail call fastcc void @_Z17function_with_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.1, i64 0) +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + tail call fastcc void @_Z17function_with_ldsPiS_(i32* %0, i32* %1) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-kernel-and-function.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-kernel-and-function.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-kernel-and-function.ll @@ -0,0 +1,87 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck -check-prefixes=NEW-LDS,OLD-LDS,OFFSET-TABLE,NEW-PARAM,GCN %s + +; OLD-LDS-NOT: @_ZZ17function_with_ldsPiS_E8smem_dev +; OLD-LDS: @_ZZ15kernel_with_ldsPiS_E9smem_kern +; NEW-LDS: @_Z15kernel_with_ldsPiS_.Unified.Device.Scope.LDS.Layout +; OLD-LDS-NOT: @_ZZ17function_with_ldsPiS_E8smem_dev +@_ZZ17function_with_ldsPiS_E8smem_dev = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 +@_ZZ15kernel_with_ldsPiS_E9smem_kern = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 + +; OFFSET-TABLE: @__LDSGlobalsOffsetTable__ + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z17function_with_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep1.1 = getelementptr inbounds [1 x [1 x i64]], [1 x [1 x i64]] addrspace(4)* @__LDSGlobalsOffsetTable__, i32 0, i64 %1, i64 0 +; GCN-NEXT: %dssv.load.1 = load i64, i64 addrspace(4)* %dssv.gep1.1, align 4 +; GCN-NEXT: %dssv.gep2.1 = getelementptr inbounds i8, i8 addrspace(3)* %0, i64 %dssv.load.1 +; GCN-NEXT: %dssv.cast.1 = bitcast i8 addrspace(3)* %dssv.gep2.1 to [256 x i32] addrspace(3)* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: 
%arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32* %arrayidx, align 4 +; GCN-NEXT: %4 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.1, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %4, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %5 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.1, i32 0, i32 %add +; GCN-NEXT: %6 = load i32, i32 addrspace(3)* %5, align 4 +; GCN-NEXT: %mul = mul nsw i32 %6, %3 +; GCN-NEXT: %arrayidx12 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32* %arrayidx12, align 4 +; GCN-NEXT: ret void +entry: + %0 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx31 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ17function_with_ldsPiS_E8smem_dev, i32 0, i32 %0 + store i32 %1, i32 addrspace(3)* %arrayidx31, align 4 + %add = add nuw nsw i32 %0, 1 + %arrayidx93 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ17function_with_ldsPiS_E8smem_dev, i32 0, i32 %add + %2 = load i32, i32 addrspace(3)* %arrayidx93, align 4 + %mul = mul nsw i32 %2, %1 + %arrayidx12 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom + store i32 %mul, i32* %arrayidx12, align 4 + ret void +} + +define protected amdgpu_kernel void @_Z15kernel_with_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.1 = getelementptr inbounds [1024 x i8], [1024 x i8] addrspace(3)* @_Z15kernel_with_ldsPiS_.Unified.Device.Scope.LDS.Layout, i32 0, i32 0 +; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32 addrspace(1)* %arrayidx3, align 4 +; GCN-NEXT: %arrayidx54 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E9smem_kern, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %arrayidx54, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %arrayidx116 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E9smem_kern, i32 0, i32 %add +; GCN-NEXT: %4 = load i32, i32 addrspace(3)* %arrayidx116, align 4 +; GCN-NEXT: %mul = mul nsw i32 %4, %3 +; GCN-NEXT: %arrayidx147 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32 addrspace(1)* %arrayidx147, align 4 +; GCN-NEXT: tail call fastcc void @_Z17function_with_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.1, i64 0) +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + %2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %2 to i64 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom + %3 = load i32, i32 addrspace(1)* %arrayidx3, align 4 + %arrayidx54 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E9smem_kern, i32 0, i32 %2 + store i32 %3, i32 addrspace(3)* 
%arrayidx54, align 4 + %add = add nuw nsw i32 %2, 1 + %arrayidx116 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E9smem_kern, i32 0, i32 %add + %4 = load i32, i32 addrspace(3)* %arrayidx116, align 4 + %mul = mul nsw i32 %4, %3 + %arrayidx147 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom + store i32 %mul, i32 addrspace(1)* %arrayidx147, align 4 + tail call fastcc void @_Z17function_with_ldsPiS_(i32* %0, i32* %1) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-kernel.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-kernel.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-kernel.ll @@ -0,0 +1,68 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck -check-prefixes=NEW-LDS,OLD-LDS,OFFSET-TABLE,NEW-PARAM,GCN %s + +; NEW-LDS-NOT: @_Z15kernel_with_ldsPiS_.Unified.Device.Scope.LDS.Layout +; OLD-LDS: @_ZZ15kernel_with_ldsPiS_E4smem +; NEW-LDS-NOT: @_Z15kernel_with_ldsPiS_.Unified.Device.Scope.LDS.Layout +@_ZZ15kernel_with_ldsPiS_E4smem = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 + +; OFFSET-TABLE-NOT: @__LDSGlobalsOffsetTable__ + +; NEW-PARAM-NOT: i8 addrspace(3)* %0 +; NEW-PARAM-NOT: i64 %1 +define internal fastcc void @_Z20function_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: %0 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %0 to i64 +; GCN-NEXT: %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom +; GCN-NEXT: %1 = load i32, i32* %arrayidx, align 4 +; GCN-NEXT: %arrayidx3 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom +; GCN-NEXT: store i32 %1, i32* %arrayidx3, align 4 +; GCN-NEXT: ret void +entry: + %0 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom + store i32 %1, i32* %arrayidx3, align 4 + ret void +} + +define protected amdgpu_kernel void @_Z15kernel_with_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32 addrspace(1)* %arrayidx3, align 4 +; GCN-NEXT: %arrayidx54 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E4smem, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %arrayidx54, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %arrayidx116 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E4smem, i32 0, i32 %add +; GCN-NEXT: %4 = load i32, i32 addrspace(3)* %arrayidx116, align 4 +; GCN-NEXT: %mul = mul nsw i32 %4, %3 +; GCN-NEXT: %arrayidx147 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32 addrspace(1)* %arrayidx147, align 4 +; 
GCN-NEXT: tail call fastcc void @_Z20function_with_no_ldsPiS_(i32* %0, i32* %1) +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + %2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %2 to i64 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom + %3 = load i32, i32 addrspace(1)* %arrayidx3, align 4 + %arrayidx54 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E4smem, i32 0, i32 %2 + store i32 %3, i32 addrspace(3)* %arrayidx54, align 4 + %add = add nuw nsw i32 %2, 1 + %arrayidx116 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E4smem, i32 0, i32 %add + %4 = load i32, i32 addrspace(3)* %arrayidx116, align 4 + %mul = mul nsw i32 %4, %3 + %arrayidx147 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom + store i32 %mul, i32 addrspace(1)* %arrayidx147, align 4 + tail call fastcc void @_Z20function_with_no_ldsPiS_(i32* %0, i32* %1) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-two-lds-arguments.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-two-lds-arguments.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-two-lds-arguments.ll @@ -0,0 +1,182 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck -check-prefixes=NEW-LDS,OLD-LDS,OFFSET-TABLE,NEW-PARAM,GCN %s + +; OLD-LDS-NOT: @_ZZ22function_four_with_ldsPiS_E19smem_1_in_func_four +; OLD-LDS-NOT: @_ZZ22function_four_with_ldsPiS_E19smem_2_in_func_four +; OLD-LDS: @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two +; OLD-LDS: @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one +; NEW-LDS: @_Z19kernel_two_with_ldsPiS_.Unified.Device.Scope.LDS.Layout +; NEW-LDS: @_Z19kernel_one_with_ldsPiS_.Unified.Device.Scope.LDS.Layout +; OLD-LDS-NOT: @_ZZ22function_four_with_ldsPiS_E19smem_1_in_func_four +; OLD-LDS-NOT: @_ZZ22function_four_with_ldsPiS_E19smem_2_in_func_four +@_ZZ22function_four_with_ldsPiS_E19smem_1_in_func_four = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 +@_ZZ22function_four_with_ldsPiS_E19smem_2_in_func_four = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 +@_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 +@_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 + +; OFFSET-TABLE: @__LDSGlobalsOffsetTable__ + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z22function_four_with_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep1.2 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]] addrspace(4)* @__LDSGlobalsOffsetTable__, i32 0, i64 %1, i64 {{[0-1]}} +; GCN-NEXT: %dssv.load.2 = load i64, i64 addrspace(4)* %dssv.gep1.2, align 4 +; GCN-NEXT: %dssv.gep2.2 = getelementptr inbounds i8, i8 addrspace(3)* %0, i64 %dssv.load.2 +; GCN-NEXT: %dssv.cast.2 = bitcast i8 addrspace(3)* %dssv.gep2.2 to [256 x i32] addrspace(3)* +; GCN-NEXT: %dssv.gep1.1 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]] addrspace(4)* @__LDSGlobalsOffsetTable__, i32 0, i64 %1, i64 {{[0-1]}} +; GCN-NEXT: %dssv.load.1 = load i64, i64 addrspace(4)* 
%dssv.gep1.1, align 4 +; GCN-NEXT: %dssv.gep2.1 = getelementptr inbounds i8, i8 addrspace(3)* %0, i64 %dssv.load.1 +; GCN-NEXT: %dssv.cast.1 = bitcast i8 addrspace(3)* %dssv.gep2.1 to [256 x i32] addrspace(3)* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32* %arrayidx, align 4 +; GCN-NEXT: %4 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.{{[1-2]}}, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %4, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %5 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.{{[1-2]}}, i32 0, i32 %add +; GCN-NEXT: %6 = load i32, i32 addrspace(3)* %5, align 4 +; GCN-NEXT: %add10 = add nsw i32 %3, %6 +; GCN-NEXT: %7 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.{{[1-2]}}, i32 0, i32 %2 +; GCN-NEXT: store i32 %add10, i32 addrspace(3)* %7, align 4 +; GCN-NEXT: %8 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.{{[1-2]}}, i32 0, i32 %add +; GCN-NEXT: %9 = load i32, i32 addrspace(3)* %8, align 4 +; GCN-NEXT: %mul = mul nsw i32 %9, %3 +; GCN-NEXT: %arrayidx23 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32* %arrayidx23, align 4 +; GCN-NEXT: ret void +entry: + %0 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx32 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ22function_four_with_ldsPiS_E19smem_1_in_func_four, i32 0, i32 %0 + store i32 %1, i32 addrspace(3)* %arrayidx32, align 4 + %add = add nuw nsw i32 %0, 1 + %arrayidx63 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ22function_four_with_ldsPiS_E19smem_1_in_func_four, i32 0, i32 %add + %2 = load i32, i32 addrspace(3)* %arrayidx63, align 4 + %add10 = add nsw i32 %1, %2 + %arrayidx134 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ22function_four_with_ldsPiS_E19smem_2_in_func_four, i32 0, i32 %0 + store i32 %add10, i32 addrspace(3)* %arrayidx134, align 4 + %arrayidx206 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ22function_four_with_ldsPiS_E19smem_2_in_func_four, i32 0, i32 %add + %3 = load i32, i32 addrspace(3)* %arrayidx206, align 4 + %mul = mul nsw i32 %3, %1 + %arrayidx23 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom + store i32 %mul, i32* %arrayidx23, align 4 + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z26function_three_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z22function_four_with_ldsPiS_(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z22function_four_with_ldsPiS_(i32* %i_arg, i32* %o_arg) + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z24function_two_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* 
%i_arg, i32* %o_arg) + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z24function_one_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg) + ret void +} + +define protected amdgpu_kernel void @_Z19kernel_two_with_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.{{[1-2]}} = getelementptr inbounds [2048 x i8], [2048 x i8] addrspace(3)* @_Z19kernel_two_with_ldsPiS_.Unified.Device.Scope.LDS.Layout, i32 0, i32 0 +; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4 +; GCN-NEXT: %arrayidx57 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %arrayidx57, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %arrayidx119 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two, i32 0, i32 %add +; GCN-NEXT: %4 = load i32, i32 addrspace(3)* %arrayidx119, align 4 +; GCN-NEXT: %mul = mul nsw i32 %4, %3 +; GCN-NEXT: %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4 +; GCN-NEXT: tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.{{[1-2]}}, i64 {{[0-1]}}) +; GCN-NEXT: tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.{{[1-2]}}, i64 {{[0-1]}}) +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + %2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %2 to i64 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom + %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %arrayidx57 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two, i32 0, i32 %2 + store i32 %3, i32 addrspace(3)* %arrayidx57, align 4 + %add = add nuw nsw i32 %2, 1 + %arrayidx119 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two, i32 0, i32 %add + %4 = load i32, i32 addrspace(3)* %arrayidx119, align 4 + %mul = mul nsw i32 %4, %3 + %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom + store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4 + tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1) + tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1) + ret void +} + +define protected amdgpu_kernel void @_Z19kernel_one_with_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture 
%o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.{{[1-2]}} = getelementptr inbounds [2048 x i8], [2048 x i8] addrspace(3)* @_Z19kernel_one_with_ldsPiS_.Unified.Device.Scope.LDS.Layout, i32 0, i32 0 +; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4 +; GCN-NEXT: %arrayidx57 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %arrayidx57, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %arrayidx119 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one, i32 0, i32 %add +; GCN-NEXT: %4 = load i32, i32 addrspace(3)* %arrayidx119, align 4 +; GCN-NEXT: %mul = mul nsw i32 %4, %3 +; GCN-NEXT: %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4 +; GCN-NEXT: tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.{{[1-2]}}, i64 {{[0-1]}}) +; GCN-NEXT: tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.{{[1-2]}}, i64 {{[0-1]}}) +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + %2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %2 to i64 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom + %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %arrayidx57 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one, i32 0, i32 %2 + store i32 %3, i32 addrspace(3)* %arrayidx57, align 4 + %add = add nuw nsw i32 %2, 1 + %arrayidx119 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one, i32 0, i32 %add + %4 = load i32, i32 addrspace(3)* %arrayidx119, align 4 + %mul = mul nsw i32 %4, %3 + %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom + store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4 + tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1) + tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x()