diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -341,6 +341,15 @@
 void initializeGCNNSAReassignPass(PassRegistry &);
 extern char &GCNNSAReassignID;
 
+ModulePass *createAMDGPUReplaceLDSUseWithPointerPass();
+void initializeAMDGPUReplaceLDSUseWithPointerPass(PassRegistry &);
+extern char &AMDGPUReplaceLDSUseWithPointerID;
+struct AMDGPUReplaceLDSUseWithPointerPass
+    : PassInfoMixin<AMDGPUReplaceLDSUseWithPointerPass> {
+  AMDGPUReplaceLDSUseWithPointerPass() {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -116,14 +116,21 @@
   // OpenCL doesn't allow declaring LDS in non-kernels, so in practice this
   // should only appear when IPO passes manages to move LDs defined in a kernel
   // into a single user function.
-
-  for (GlobalVariable &GV : M.globals()) {
-    // TODO: Region address
-    unsigned AS = GV.getAddressSpace();
-    if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
-      continue;
-
-    recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+  //
+  // Now that LDS uses within non-kernel functions are handled by the
+  // `LowerModuleLDS` pass, we no longer need to forcefully inline non-kernel
+  // functions just because they use LDS. Do forceful inlining only when the
+  // `LowerModuleLDS` pass is disabled. It is enabled by default.
+
+  if (!AMDGPUTargetMachine::EnableLowerModuleLDS) {
+    for (GlobalVariable &GV : M.globals()) {
+      // TODO: Region address
+      unsigned AS = GV.getAddressSpace();
+      if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
+        continue;
+
+      recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+    }
   }
 
   if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -28,6 +28,7 @@
 #include "AMDGPU.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDGPUGeneralUtils.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -53,87 +54,13 @@
   return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
 }
 
-static Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
-  return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
-                                       GV->getValueType());
-}
-
-static bool
-userRequiresLowering(const SmallPtrSetImpl<GlobalValue *> &UsedList,
-                     User *InitialUser) {
-  // Any LDS variable can be lowered by moving into the created struct
-  // Each variable so lowered is allocated in every kernel, so variables
-  // whose users are all known to be safe to lower without the transform
-  // are left unchanged.
-  SmallPtrSet<User *, 8> Visited;
-  SmallVector<User *, 16> Stack;
-  Stack.push_back(InitialUser);
-
-  while (!Stack.empty()) {
-    User *V = Stack.pop_back_val();
-    Visited.insert(V);
-
-    if (auto *G = dyn_cast<GlobalValue>(V->stripPointerCasts())) {
-      if (UsedList.contains(G)) {
-        continue;
-      }
-    }
-
-    if (auto *I = dyn_cast<Instruction>(V)) {
-      if (isKernelCC(I->getFunction())) {
-        continue;
-      }
-    }
-
-    if (auto *E = dyn_cast<ConstantExpr>(V)) {
-      for (Value::user_iterator EU = E->user_begin(); EU != E->user_end();
-           ++EU) {
-        if (Visited.insert(*EU).second) {
-          Stack.push_back(*EU);
-        }
-      }
-      continue;
-    }
-
-    // Unknown user, conservatively lower the variable
-    return true;
-  }
-
-  return false;
-}
-
 static std::vector<GlobalVariable *>
 findVariablesToLower(Module &M,
                      const SmallPtrSetImpl<GlobalValue *> &UsedList) {
   std::vector<GlobalVariable *> LocalVars;
   for (auto &GV : M.globals()) {
-    if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
-      continue;
-    }
-    if (!GV.hasInitializer()) {
-      // addrspace(3) without initializer implies cuda/hip extern __shared__
-      // the semantics for such a variable appears to be that all extern
-      // __shared__ variables alias one another, in which case this transform
-      // is not required
-      continue;
-    }
-    if (!isa<UndefValue>(GV.getInitializer())) {
-      // Initializers are unimplemented for local address space.
-      // Leave such variables in place for consistent error reporting.
-      continue;
-    }
-    if (GV.isConstant()) {
-      // A constant undef variable can't be written to, and any load is
-      // undef, so it should be eliminated by the optimizer. It could be
-      // dropped by the back end if not. This pass skips over it.
-      continue;
-    }
-    if (std::none_of(GV.user_begin(), GV.user_end(), [&](User *U) {
-          return userRequiresLowering(UsedList, U);
-        })) {
-      continue;
-    }
-    LocalVars.push_back(&GV);
+    if (isLDSLoweringRequired(&GV, UsedList, /*IsLDSLoweringPass=*/true))
+      LocalVars.push_back(&GV);
   }
   return LocalVars;
 }
@@ -217,20 +144,6 @@
                        "");
 }
 
-  static SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
-    SmallPtrSet<GlobalValue *, 32> UsedList;
-
-    SmallVector<GlobalValue *, 32> TmpVec;
-    collectUsedGlobalVariables(M, TmpVec, true);
-    UsedList.insert(TmpVec.begin(), TmpVec.end());
-
-    TmpVec.clear();
-    collectUsedGlobalVariables(M, TmpVec, false);
-    UsedList.insert(TmpVec.begin(), TmpVec.end());
-
-    return UsedList;
-  }
-
 public:
   static char ID;
@@ -365,9 +278,16 @@
 char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID;
 
-INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE,
-                "Lower uses of LDS variables from non-kernel functions", false,
-                false)
+INITIALIZE_PASS_BEGIN(AMDGPULowerModuleLDS, DEBUG_TYPE,
+                      "Lower uses of LDS variables from non-kernel functions",
+                      false, false)
+// Before running the current LDS lowering pass, replace LDS uses within
+// non-kernel functions by pointers, so that the current pass minimizes
+// unnecessary per-kernel allocation of LDS memory.
+INITIALIZE_PASS_DEPENDENCY(AMDGPUReplaceLDSUseWithPointer)
+INITIALIZE_PASS_END(AMDGPULowerModuleLDS, DEBUG_TYPE,
+                    "Lower uses of LDS variables from non-kernel functions",
+                    false, false)
 
 ModulePass *llvm::createAMDGPULowerModuleLDSPass() {
   return new AMDGPULowerModuleLDS();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
@@ -0,0 +1,758 @@
+//===-- AMDGPUReplaceLDSUseWithPointer.cpp --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// One of the memory types supported within the AMD GPU memory hierarchy is
+// `shared memory`, also called `Local Data Share`, or LDS for short. LDS is
+// the second fastest memory in the AMD GPU memory hierarchy (the register
+// file being the fastest). Being faster also means LDS memory is
+// comparatively costlier, and hence is a limited memory resource.
+//
+// Being global scoped, an LDS variable is accessible within kernel functions
+// and non-kernel functions, but two different kernel execution paths, say
+// called from two kernels K1 and K2, cannot access the same instance of an LDS
+// variable, say L. Both K1 and K2 have to own their own instances of L. This
+// poses some challenges, especially for lowering the LDS variables used within
+// non-kernel functions.
+//
+// So, the pass - "Lower Module LDS" lowers the LDS globals by packing them
+// within a struct type, and by creating an instance of that struct type within
+// every kernel at address zero. Though the pass - "Lower Module LDS" makes
+// some effort to minimize unnecessary LDS allocation, it is limited by the
+// fundamental assumptions upon which the pass is implemented.
+//
+// The current pass acts as a helping aid to the pass - "Lower Module LDS",
+// with the intention of minimizing unnecessary LDS allocation as much as
+// possible.
+//
+// The main idea behind the current pass is:
+//
+// (1) To identify the LDS globals used within non-kernel function scope and
+//     global scope,
+// (2) To push the use of all the above identified LDS globals to kernel
+//     function scope by initializing their addresses to newly created LDS
+//     global pointer variables (within kernel functions),
+// (3) To replace the uses of original LDS globals within non-kernel functions
+//     by their pointer counterparts.
+// (4) This way, the transformation makes sure that the pass "Lower Module LDS"
+//     packs only pointer variables within the struct type, and hence
+//     significantly minimizes unnecessary LDS allocation, especially when the
+//     original LDS globals are big arrays (the common LDS use case).
+//
+// NOTE: The pass - "Lower Module LDS" now has a tight dependency on the
+//       current pass, and the current pass should always be run before the
+//       pass "Lower Module LDS". Running the pass "Lower Module LDS" alone may
+//       lead to surprising results.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDGPUGeneralUtils.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <map>
+#include <string>
+#include <vector>
+
+#define DEBUG_TYPE "amdgpu-replace-lds-use-with-pointer"
+
+using namespace llvm;
+
+namespace {
+
+// Error kinds for handling the errors within the context of the current pass.
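+// LLEK_NoCalleeDefinitionError is reported when a function reachable from a
+// kernel lacks a definition within the module; LLEK_InternalError covers any
+// unexpected IR construct that the pass does not know how to handle.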
+enum ReplaceLDSErrorKind : uint32_t {
+  LLEK_EndOfList = 0u,
+  LLEK_InternalError = 2u,
+  LLEK_NoCalleeDefinitionError = 3u
+};
+
+} // namespace
+
+// Report an error within the context of the current pass based on the error
+// kind.
+static void reportReplaceLDSError(ReplaceLDSErrorKind EK, Value *V = nullptr) {
+  std::string ErrStr("The pass \"Replace LDS Use With Pointer\" ");
+
+  switch (EK) {
+  default:
+  case LLEK_InternalError: {
+    ErrStr = ErrStr + std::string("has encountered an internal error.");
+    break;
+  }
+  case LLEK_NoCalleeDefinitionError: {
+    ErrStr =
+        ErrStr +
+        std::string("assumes that the definitions of both caller and callee "
+                    "appear within the same module. But the definition of "
+                    "the callee \"") +
+        V->getName().str() + std::string("\" is not available.");
+    break;
+  }
+  }
+
+  report_fatal_error(ErrStr);
+}
+
+// Helper function around `ValueMap` to detect if an element exists within it.
+template <typename R, typename E>
+static bool contains(R &&VMap, const E &Element) {
+  return VMap.find(Element) != VMap.end();
+}
+
+// Within the user `U`, replace the use(s) of `OldValue` by `NewValue`.
+static void updateUserOperand(User *U, Value *OldValue, Value *NewValue) {
+  unsigned Ind = 0;
+  for (Use &UU : U->operands()) {
+    if (UU.get() == OldValue)
+      U->setOperand(Ind, NewValue);
+    ++Ind;
+  }
+}
+
+// Convert `ConstantExpr CE` to a corresponding set of instructions, and update
+// users of `CE` to use the corresponding instructions.
+static Instruction *
+replaceConstExprByInst(ConstantExpr *CE,
+                       SmallPtrSetImpl<Instruction *> &Insts) {
+  Instruction *NI = nullptr;
+
+  SmallVector<User *, 8> CEUsers;
+  append_range(CEUsers, CE->users());
+
+  for (auto *U : CEUsers) {
+    auto *I = dyn_cast<Instruction>(U);
+    if (!I) {
+      auto *CE2 = dyn_cast<ConstantExpr>(U);
+      assert(CE2 && "Constant expression expected.");
+      I = replaceConstExprByInst(CE2, Insts);
+    }
+
+    NI = CE->getAsInstruction();
+    NI->insertBefore(I);
+    updateUserOperand(I, CE, NI);
+    CE->removeDeadConstantUsers();
+    Insts.insert(NI);
+  }
+
+  assert(NI && "Instruction expected.");
+
+  return NI;
+}
+
+// `U` should be either an `Instruction` OR a `ConstantExpr`. If it is an
+// `Instruction`, return it; if it is a `ConstantExpr`, break it into a set of
+// instructions and return them.
+static void getInstructions(User *U, SmallPtrSetImpl<Instruction *> &Insts) {
+  if (auto *I = dyn_cast<Instruction>(U)) {
+    // Return instruction `I`.
+    Insts.insert(I);
+  } else if (auto *CE = dyn_cast<ConstantExpr>(U)) {
+    // Break the constant expression `CE` into a set of instructions.
+    replaceConstExprByInst(CE, Insts);
+  } else {
+    // Unexpected control flow - what else is missing?
+    reportReplaceLDSError(LLEK_InternalError);
+  }
+}
+
+// Return true if the user `U` is a global variable.
+static bool isUserGlobalVariable(User *U) {
+  SmallVector<User *, 8> UserStack;
+  SmallPtrSet<User *, 8> VisitedUsers;
+
+  UserStack.push_back(U);
+
+  while (!UserStack.empty()) {
+    auto *U = UserStack.pop_back_val();
+
+    if (!VisitedUsers.insert(U).second)
+      continue;
+
+    if (isa<GlobalVariable>(U))
+      return true;
+
+    if (isa<Constant>(U)) {
+      append_range(UserStack, U->users());
+      continue;
+    }
+
+    if (isa<Instruction>(U))
+      return false;
+  }
+
+  return false;
+}
+
+// Collect functions whose address is taken within the module.
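+// The external calling node of the call graph has call edges to every function
+// that could be called from outside the module, which includes all
+// address-taken functions; the loop below filters that list down to the
+// candidates of interest.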
+static void collectAddressTakenFunctions(
+    CallGraph &CG, SmallPtrSetImpl<CallGraphNode *> &AddressTakenSet) {
+  auto *ExternalCallingNode = CG.getExternalCallingNode();
+  for (auto GI = ExternalCallingNode->begin(), GE = ExternalCallingNode->end();
+       GI != GE; ++GI) {
+    auto *CGN = GI->second;
+    auto *F = CGN->getFunction();
+    // Note that we intentionally also collect "declaration only" address
+    // taken functions here; later, an error will be thrown when we check for
+    // the definitions of callees, since this pass assumes that both caller
+    // and callee appear within the same module.
+    // FIXME: Anything else need to be excluded?
+    if (!F || AMDGPU::isModuleEntryFunctionCC(F->getCallingConv()))
+      continue;
+    AddressTakenSet.insert(CGN);
+  }
+}
+
+namespace {
+
+class ReplaceLDSUseImpl {
+  Module &M;
+  LLVMContext &Ctx;
+  const DataLayout &DL;
+
+  // Holds all kernels defined within the module `M`.
+  SmallPtrSet<Function *, 8> Kernels;
+
+  // Holds all LDS globals defined within the module `M`.
+  SmallPtrSet<GlobalVariable *, 8> LDSGlobals;
+
+  // Holds all those LDS globals which are used as initializers within some
+  // other global variable definitions.
+  SmallPtrSet<GlobalVariable *, 8> LDSGlobalsAsInitializers;
+
+  // Associates each LDS global with the list of functions which reference
+  // that LDS.
+  ValueMap<GlobalVariable *, SmallPtrSet<Function *, 8>> LDSGlobalToAccessors;
+
+  // Associates each function with the list of LDS globals which are referenced
+  // within that function.
+  ValueMap<Function *, SmallPtrSet<GlobalVariable *, 8>> AccessorToLDSGlobals;
+
+  // Associates each kernel with the list of non-kernel functions which are
+  // reachable from that kernel.
+  ValueMap<Function *, SmallPtrSet<Function *, 8>> KernelToCallees;
+
+  // Associates each kernel with the list of LDS globals which are referenced
+  // along the run time kernel execution paths (within non-kernel functions)
+  // associated with that kernel.
+  ValueMap<Function *, SmallPtrSet<GlobalVariable *, 8>> KernelToLDSGlobals;
+
+  // Associates each LDS global with a unique pointer which points to that LDS
+  // global.
+  ValueMap<GlobalVariable *, GlobalVariable *> LDSToPointer;
+
+  // Associates each non-kernel function with a map from LDS globals to
+  // int-to-ptr instructions.
+  std::map<Function *, std::map<GlobalVariable *, Value *>> FunctionToLDSToInst;
+
+public:
+  explicit ReplaceLDSUseImpl(Module &M)
+      : M(M), Ctx(M.getContext()), DL(M.getDataLayout()) {}
+
+  // Entry-point function.
+  bool replace();
+
+private:
+  //===--------------------------------------------------------------------===//
+  // Methods which aid in creating new global LDS pointers which point to
+  // original LDS globals which are referenced within non-kernel functions.
+  //===--------------------------------------------------------------------===//
+
+  // Construct an `IntToPtr` instruction which replaces `LDS` within `F`.
+  Value *getIntToPtrInst(Function *F, GlobalVariable *LDS,
+                         GlobalVariable *LDSPointer);
+
+  // Replace all uses of original LDS globals within all non-kernel functions
+  // by their respective LDS pointers.
+  void replaceUsesOfLDSGlobalsByPointers();
+
+  // Insert global LDS pointers (which point to original LDS globals which are
+  // referenced within non-kernel functions) and initialize them within kernels
+  // to point to the respective LDS globals.
+  void insertAndInitializeLDSPointers();
+
+  //===--------------------------------------------------------------------===//
+  // Methods which aid in creating the various `map` data structures.
+  //===--------------------------------------------------------------------===//
+
+  // Associate each kernel K with LDS globals which are being accessed by K
+  // and/or by the callees of K.
+  void createKernelToLDSGlobalsMap();
+
+  // Collect all call graph nodes which are reachable from the node `CGN`.
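+  // The implementation walks the strongly connected components of the call
+  // graph via scc_iterator; every node of every SCC rooted at `CGN` is, by
+  // construction, reachable from `CGN`.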
+  void
+  collectReachableCallGraphNodes(CallGraphNode *CGN,
+                                 SetVector<CallGraphNode *> &ReachableCGNodes);
+
+  // Resolve all indirect call sites within the call graph node `CGN`.
+  void
+  resolveIndirectCallSites(CallGraphNode *CGN, CallGraph &CG,
+                           SmallPtrSetImpl<CallGraphNode *> &AddressTakenSet,
+                           SetVector<CallGraphNode *> &ReachableCGNodes);
+
+  // Traverse the `CallGraph` starting from the `CallGraphNode` associated with
+  // each kernel `K` and collect all callees which are reachable from K
+  // (including indirectly called callees).
+  void createKernelToCalleesMap();
+
+  // Associate each kernel/function with the LDS globals which are being
+  // accessed within them.
+  void createAccessorToLDSGlobalsMap();
+
+  // For each `LDS`, recursively visit its user list and find all those
+  // kernels/functions within which the `LDS` is being accessed.
+  void createLDSGlobalToAccessorsMap();
+
+  // For each kernel `K`, collect LDS globals which are being accessed during
+  // the execution of `K`.
+  bool collectPerKernelAccessibleLDSGlobals();
+
+  //===--------------------------------------------------------------------===//
+  // Methods which aid in creating the various `set` data structures.
+  //===--------------------------------------------------------------------===//
+
+  // Collect all the LDS globals defined within the current module which
+  // require pointer replacement.
+  bool collectLDSGlobals();
+
+  // Collect all the amdgpu kernels defined within the current module.
+  bool collectKernels();
+};
+
+// Construct an `IntToPtr` instruction which replaces `LDS` within `F`.
+Value *ReplaceLDSUseImpl::getIntToPtrInst(Function *F, GlobalVariable *LDS,
+                                          GlobalVariable *LDSPointer) {
+  // Create an entry for `F` within `FunctionToLDSToInst`.
+  if (!contains(FunctionToLDSToInst, F))
+    FunctionToLDSToInst[F] = std::map<GlobalVariable *, Value *>();
+
+  // `IntToPtr` instruction to be constructed.
+  Value *IToP = nullptr;
+
+  auto &LDSToInst = FunctionToLDSToInst[F];
+  if (!contains(LDSToInst, LDS)) {
+    // Get the instruction insertion point at the beginning of the entry
+    // block of the current non-kernel function.
+    auto *EI = &(*(F->getEntryBlock().getFirstInsertionPt()));
+    IRBuilder<> Builder(EI);
+
+    // Insert Load and IntToPtr instructions.
+    IToP = Builder.CreateIntToPtr(
+        Builder.CreateLoad(LDSPointer->getValueType(), LDSPointer),
+        LDS->getType());
+    LDSToInst[LDS] = IToP;
+  } else
+    IToP = LDSToInst[LDS];
+
+  return IToP;
+}
+
+// Replace all uses of original LDS globals within all non-kernel functions by
+// their respective LDS pointers.
+void ReplaceLDSUseImpl::replaceUsesOfLDSGlobalsByPointers() {
+  for (auto LI = LDSToPointer.begin(), LE = LDSToPointer.end(); LI != LE;
+       ++LI) {
+    auto *LDS = LI->first;
+    auto *LDSPointer = LI->second;
+
+    SmallVector<User *, 8> LDSUsers(LDS->users());
+    for (auto *U : LDSUsers) {
+      // `U` is a global variable (from a different address space) which got
+      // initialized with `LDS`. No need to handle it.
+      if (isUserGlobalVariable(U))
+        continue;
+
+      // `U` is from within some function. The replacements for `LDS` within
+      // `U` are instructions, and instructions cannot be embedded within
+      // constant expressions. Hence, if `U` is a constant expression, first
+      // break it into the corresponding instructions.
+      SmallPtrSet<Instruction *, 8> Insts;
+      getInstructions(U, Insts);
+
+      for (auto *I : Insts) {
+        // Get the function to which `I` belongs.
+        auto *F = I->getParent()->getParent();
+
+        // Ignore uses within kernels.
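+        // Kernels can keep addressing the original LDS global directly; only
+        // non-kernel function bodies are switched over to the pointer.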
+        if (AMDGPU::isModuleEntryFunctionCC(F->getCallingConv()))
+          continue;
+
+        // Construct an `IntToPtr` instruction which replaces `LDS` within `F`.
+        auto *IToP = getIntToPtrInst(F, LDS, LDSPointer);
+
+        // Replace the uses of `LDS` within `I` by `IToP`.
+        updateUserOperand(I, LDS, IToP);
+      }
+    }
+  }
+}
+
+// Insert global LDS pointers (which point to original LDS globals which are
+// referenced within non-kernel functions) and initialize them within kernels
+// to point to the respective LDS globals.
+void ReplaceLDSUseImpl::insertAndInitializeLDSPointers() {
+  unsigned LID = 0;
+
+  for (auto KI = KernelToLDSGlobals.begin(), KE = KernelToLDSGlobals.end();
+       KI != KE; ++KI) {
+    // Get the instruction insertion point at the beginning of the entry block
+    // of the current kernel.
+    auto *EI = &(*(KI->first->getEntryBlock().getFirstInsertionPt()));
+    IRBuilder<> Builder(EI);
+
+    // Insert and initialize LDS pointers for all LDS globals which are
+    // associated with the current kernel.
+    for (auto *LDS : KI->second) {
+      GlobalVariable *LDSPointer = nullptr;
+
+      if (!contains(LDSToPointer, LDS)) {
+        // `LDS` is encountered for the first time; create an LDS pointer
+        // which is supposed to point to `LDS`.
+        ++LID;
+        auto *I16Ty = Type::getInt16Ty(Ctx);
+        LDSPointer = new GlobalVariable(
+            M, I16Ty, false, GlobalValue::InternalLinkage,
+            UndefValue::get(I16Ty),
+            Twine("llvm.amdgcn.lds.pointer.") + Twine(LID), nullptr,
+            GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
+        LDSPointer->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+        LDSPointer->setAlignment(getAlign(M.getDataLayout(), LDSPointer));
+        LDSToPointer[LDS] = LDSPointer;
+      } else {
+        // An LDS pointer which points to `LDS` is already created; get it.
+        LDSPointer = LDSToPointer[LDS];
+      }
+
+      // Insert instructions at `EI` in order to initialize `LDSPointer` to
+      // point to `LDS`.
+      Builder.CreateStore(Builder.CreatePtrToInt(LDS, Type::getInt16Ty(Ctx)),
+                          LDSPointer);
+    }
+  }
+}
+
+// Associate each kernel K with LDS globals which are being accessed by K
+// and/or by the callees of K.
+void ReplaceLDSUseImpl::createKernelToLDSGlobalsMap() {
+  for (auto *K : Kernels) {
+    SmallPtrSet<GlobalVariable *, 8> LDSSet(LDSGlobalsAsInitializers.begin(),
+                                            LDSGlobalsAsInitializers.end());
+
+    // Collect all those LDS globals which are being accessed by the callees of
+    // kernel K.
+    if (contains(KernelToCallees, K)) {
+      for (auto *Callee : KernelToCallees[K]) {
+        if (contains(AccessorToLDSGlobals, Callee))
+          LDSSet.insert(AccessorToLDSGlobals[Callee].begin(),
+                        AccessorToLDSGlobals[Callee].end());
+      }
+    }
+
+    if (!LDSSet.empty())
+      KernelToLDSGlobals[K] = LDSSet;
+  }
+}
+
+// Collect all call graph nodes which are reachable from the node `CGN`.
+void ReplaceLDSUseImpl::collectReachableCallGraphNodes(
+    CallGraphNode *CGN, SetVector<CallGraphNode *> &ReachableCGNodes) {
+  for (scc_iterator<CallGraphNode *> I = scc_begin(CGN); !I.isAtEnd(); ++I) {
+    const std::vector<CallGraphNode *> &SCC = *I;
+    assert(!SCC.empty() && "SCC with no functions?");
+    for (auto *CGNode : SCC)
+      ReachableCGNodes.insert(CGNode);
+  }
+}
+
+// Resolve all indirect call sites within the call graph node `CGN`.
+void ReplaceLDSUseImpl::resolveIndirectCallSites(
+    CallGraphNode *CGN, CallGraph &CG,
+    SmallPtrSetImpl<CallGraphNode *> &AddressTakenSet,
+    SetVector<CallGraphNode *> &ReachableCGNodes) {
+  for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) {
+    auto *CB = cast<CallBase>(GI->first.getValue());
+
+    // If the call site `CB` is not an indirect call site, ignore it and go to
+    // the next one; otherwise, resolve the indirect call site `CB` to a set of
+    // potential callees.
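+    // Two cases arise for an indirect call site: either `!callees` metadata
+    // pins down the exact set of potential targets, or every address-taken
+    // function with a matching signature must conservatively be assumed
+    // callable.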
+    if (!CB->isIndirectCall())
+      continue;
+
+    // "Inline asm call sites" cannot be handled. Ignore them.
+    if (CB->isInlineAsm())
+      continue;
+
+    // `CB` is an indirect call; handle it.
+    //
+    if (auto *MD = CB->getMetadata(LLVMContext::MD_callees)) {
+      // The metadata "!callees" is available at the indirect call site `CB`,
+      // which means all the potential target callees for the call site `CB`
+      // are successfully resolved at compile time. Collect them.
+      for (const auto &Op : MD->operands()) {
+        auto *GCN = CG[mdconst::extract_or_null<Function>(Op)];
+        collectReachableCallGraphNodes(GCN, ReachableCGNodes);
+      }
+    } else {
+      // The metadata "!callees" is *NOT* available at the indirect call site
+      // `CB`, which means `CB` has *NO* information about potential target
+      // callees. The simplest possible *SAFE* assumption that we can make here
+      // is to consider all those "address taken" functions whose signature
+      // matches that of the call site `CB`, and assume that all these
+      // signature-matched "address taken" functions are possible potential
+      // callees. So, collect all these signature-matched "address taken"
+      // functions.
+      auto *CBFTy = CB->getFunctionType();
+      for (auto *CGN : AddressTakenSet) {
+        if (CGN->getFunction()->getFunctionType() == CBFTy)
+          collectReachableCallGraphNodes(CGN, ReachableCGNodes);
+      }
+    }
+  }
+}
+
+// Traverse the `CallGraph` starting from the `CallGraphNode` associated with
+// each kernel `K` and collect all the callees which are reachable from K
+// (including indirectly called callees).
+void ReplaceLDSUseImpl::createKernelToCalleesMap() {
+  // Create the call graph `CG` of the module `M`, collect all the address
+  // taken functions, and explore `CG` to collect all the reachable callees
+  // (including indirectly called callees) from all kernels.
+  CallGraph CG = CallGraph(M);
+
+  // Holds call graph nodes associated with the functions whose addresses are
+  // taken within the module.
+  SmallPtrSet<CallGraphNode *, 8> AddressTakenSet;
+
+  // Collect all address taken functions within the module `M`.
+  collectAddressTakenFunctions(CG, AddressTakenSet);
+
+  for (auto *K : Kernels) {
+    // Get the `CallGraphNode` representing kernel `K`.
+    auto *KernCGNode = CG[K];
+
+    // Collect all call graph nodes which are reachable from `KernCGNode`.
+    SetVector<CallGraphNode *> ReachableCGNodes;
+    collectReachableCallGraphNodes(KernCGNode, ReachableCGNodes);
+
+    // Remove the `CallGraphNode` representing kernel `K` from the reachable
+    // node set.
+    ReachableCGNodes.remove(KernCGNode);
+
+    // Collect all callees (including potential indirect callees) which are
+    // reachable from kernel `K`. First, resolve all indirect call sites within
+    // kernel `K`, and then `recursively` within all callees reachable from
+    // kernel `K`.
+    SmallPtrSet<Function *, 8> ReachableCallees;
+    SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes;
+
+    resolveIndirectCallSites(KernCGNode, CG, AddressTakenSet,
+                             ReachableCGNodes);
+
+    while (!ReachableCGNodes.empty()) {
+      auto *CGN = ReachableCGNodes.pop_back_val();
+
+      // If `CGN` is already handled, OR if there is no function associated
+      // with `CGN`, then ignore it.
+      if (!VisitedCGNodes.insert(CGN).second || !CGN->getFunction())
+        continue;
+
+      auto *F = CGN->getFunction();
+
+      // This pass expects both caller and callee to appear in the same module.
+      // Report an error if `F` is a non-kernel function and is not a
+      // definition.
+      if (!AMDGPU::isModuleEntryFunctionCC(F->getCallingConv()) &&
+          F->isDeclaration())
+        reportReplaceLDSError(LLEK_NoCalleeDefinitionError, F);
+
+      // The callee associated with `CGN` is reachable from kernel `K`.
+      ReachableCallees.insert(F);
+
+      // Resolve all indirect call sites within the callee `F`.
+      resolveIndirectCallSites(CGN, CG, AddressTakenSet, ReachableCGNodes);
+    }
+
+    KernelToCallees[K] = ReachableCallees;
+  }
+}
+
+// Associate each kernel/function with the LDS globals which are being accessed
+// within them.
+void ReplaceLDSUseImpl::createAccessorToLDSGlobalsMap() {
+  for (auto LI = LDSGlobalToAccessors.begin(), LE = LDSGlobalToAccessors.end();
+       LI != LE; ++LI) {
+    for (auto *A : LI->second) {
+      if (!contains(AccessorToLDSGlobals, A)) {
+        SmallPtrSet<GlobalVariable *, 8> LDSSet;
+        LDSSet.insert(LI->first);
+        AccessorToLDSGlobals[A] = LDSSet;
+      } else
+        AccessorToLDSGlobals[A].insert(LI->first);
+    }
+  }
+}
+
+// For each `LDS`, recursively visit its user list and find all those
+// kernels/functions within which the `LDS` is being accessed.
+void ReplaceLDSUseImpl::createLDSGlobalToAccessorsMap() {
+  for (auto *LDS : LDSGlobals) {
+    SmallPtrSet<Function *, 8> LDSAccessors;
+    SmallVector<User *, 8> UserStack(LDS->users());
+    SmallPtrSet<User *, 8> VisitedUsers;
+
+    while (!UserStack.empty()) {
+      auto *U = UserStack.pop_back_val();
+
+      // Is `U` already visited? Continue to the next one.
+      if (!VisitedUsers.insert(U).second)
+        continue;
+
+      // `U` is a global variable (from a different address space) which is
+      // initialized with `LDS`. Ignore `U`.
+      if (isa<GlobalVariable>(U)) {
+        LDSGlobalsAsInitializers.insert(LDS);
+        continue;
+      }
+
+      // `U` is a `Constant`. Push back the users of `U`, and continue further
+      // exploring the stack until an `Instruction` is found.
+      if (isa<Constant>(U)) {
+        append_range(UserStack, U->users());
+        continue;
+      }
+
+      // `U` should be an instruction. Otherwise something is wrong.
+      auto *I = dyn_cast<Instruction>(U);
+      if (!I)
+        reportReplaceLDSError(LLEK_InternalError);
+
+      // We have successfully found a kernel/function within which the `LDS` is
+      // being accessed; insert it into the `LDSAccessors` set.
+      LDSAccessors.insert(I->getParent()->getParent());
+    }
+
+    LDSGlobalToAccessors[LDS] = LDSAccessors;
+  }
+}
+
+// For each kernel `K`, collect LDS globals which are being accessed during the
+// execution of `K`.
+bool ReplaceLDSUseImpl::collectPerKernelAccessibleLDSGlobals() {
+  // Associate each LDS with the kernels/functions within which the LDS is
+  // being accessed.
+  createLDSGlobalToAccessorsMap();
+
+  // Associate each kernel/function with the LDS globals which are being
+  // accessed within them.
+  createAccessorToLDSGlobalsMap();
+
+  // Associate each kernel K with the callees which are reachable from K
+  // (including indirectly called callees).
+  createKernelToCalleesMap();
+
+  // Associate each kernel K with LDS globals which are being accessed by K
+  // and/or by the callees of K.
+  createKernelToLDSGlobalsMap();
+
+  // If *none* of the kernels is associated with any LDS global which needs
+  // pointer replacement, then there is nothing to do.
+  return !KernelToLDSGlobals.empty();
+}
+
+// Collect all the (static) LDS globals defined within the current module which
+// require pointer replacement.
+bool ReplaceLDSUseImpl::collectLDSGlobals() {
+  SmallPtrSet<GlobalValue *, 32> UsedList = getUsedList(M);
+  for (auto &GV : M.globals()) {
+    if (isLDSLoweringRequired(&GV, UsedList))
+      LDSGlobals.insert(&GV);
+  }
+
+  return !LDSGlobals.empty();
+}
+
+// Collect all the amdgpu kernels defined within the current module.
+bool ReplaceLDSUseImpl::collectKernels() {
+  for (auto &F : M.functions()) {
+    // Collect `F` if it is a definition of an entry point function.
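+    // AMDGPU::isModuleEntryFunctionCC() is true for amdgpu_kernel and the
+    // other entry-point calling conventions (e.g. spir_kernel).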
+    if (!F.isDeclaration() &&
+        AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
+      Kernels.insert(&F);
+  }
+
+  return !Kernels.empty();
+}
+
+// Entry-point function.
+bool ReplaceLDSUseImpl::replace() {
+  // If there are *no* kernels defined within the module, or if there are *no*
+  // LDS globals defined within the module, then there is nothing to do.
+  if (!collectKernels() || !collectLDSGlobals())
+    return false;
+
+  // There are kernels and LDS globals defined within the module, but, if none
+  // of the LDS globals are being accessed within non-kernel functions along
+  // the run time kernel execution paths, then there is nothing to do.
+  if (!collectPerKernelAccessibleLDSGlobals())
+    return false;
+
+  // Insert global LDS pointers (which point to original LDS globals which are
+  // referenced within non-kernel functions) and initialize them within kernels
+  // to point to the respective LDS globals.
+  insertAndInitializeLDSPointers();
+
+  // Replace all uses of original LDS globals within all non-kernel functions
+  // by their respective LDS pointers.
+  replaceUsesOfLDSGlobalsByPointers();
+
+  return true;
+}
+
+class AMDGPUReplaceLDSUseWithPointer : public ModulePass {
+public:
+  static char ID;
+
+  AMDGPUReplaceLDSUseWithPointer() : ModulePass(ID) {
+    initializeAMDGPUReplaceLDSUseWithPointerPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override;
+};
+
+} // namespace
+
+char AMDGPUReplaceLDSUseWithPointer::ID = 0;
+char &llvm::AMDGPUReplaceLDSUseWithPointerID =
+    AMDGPUReplaceLDSUseWithPointer::ID;
+
+INITIALIZE_PASS(AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE,
+                "Replace non-kernel use of LDS with pointer",
+                false /*only look at the cfg*/, false /*analysis pass*/)
+
+bool AMDGPUReplaceLDSUseWithPointer::runOnModule(Module &M) {
+  ReplaceLDSUseImpl LDSReplacer{M};
+  return LDSReplacer.replace();
+}
+
+ModulePass *llvm::createAMDGPUReplaceLDSUseWithPointerPass() {
+  return new AMDGPUReplaceLDSUseWithPointer();
+}
+
+PreservedAnalyses
+AMDGPUReplaceLDSUseWithPointerPass::run(Module &M, ModuleAnalysisManager &AM) {
+  ReplaceLDSUseImpl LDSReplacer{M};
+  LDSReplacer.replace();
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -35,6 +35,7 @@
   static bool EnableLateStructurizeCFG;
   static bool EnableFunctionCalls;
   static bool EnableFixedFunctionABI;
+  static bool EnableLowerModuleLDS;
 
   AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                       StringRef FS, TargetOptions Options,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -193,10 +193,10 @@
     cl::desc("Enable workarounds for the StructurizeCFG pass"),
     cl::init(true), cl::Hidden);
 
-static cl::opt<bool>
-    DisableLowerModuleLDS("amdgpu-disable-lower-module-lds", cl::Hidden,
-                          cl::desc("Disable lower module lds pass"),
-                          cl::init(false));
+static cl::opt<bool, true> EnableLowerModuleLDS(
+    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
+    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
+    cl::Hidden);
 
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
@@ -265,6 +265,7 @@
   initializeGCNRegBankReassignPass(*PR);
   initializeGCNNSAReassignPass(*PR);
   initializeSIAddIMGInitPass(*PR);
+  initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -398,6 +399,7 @@
 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
 bool AMDGPUTargetMachine::EnableFunctionCalls = false;
 bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
+bool AMDGPUTargetMachine::EnableLowerModuleLDS = false;
 
 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
 
@@ -512,6 +514,7 @@
       return true;
     }
     if (PassName == "amdgpu-lower-module-lds") {
+      PM.addPass(AMDGPUReplaceLDSUseWithPointerPass());
       PM.addPass(AMDGPULowerModuleLDSPass());
       return true;
     }
@@ -872,7 +875,6 @@
   addPass(createAtomicExpandPass());
 
-
   addPass(createAMDGPULowerIntrinsicsPass());
 
   // Function calls are not supported, so make sure we inline everything.
@@ -893,8 +895,10 @@
   addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
 
   // Can increase LDS used by kernel so runs before PromoteAlloca
-  if (!DisableLowerModuleLDS)
+  if (EnableLowerModuleLDS) {
+    addPass(createAMDGPUReplaceLDSUseWithPointerPass());
     addPass(createAMDGPULowerModuleLDSPass());
+  }
 
   if (TM.getOptLevel() > CodeGenOpt::None) {
     addPass(createInferAddressSpacesPass());
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -81,6 +81,7 @@
   AMDGPUPropagateAttributes.cpp
   AMDGPURegBankCombiner.cpp
   AMDGPURegisterBankInfo.cpp
+  AMDGPUReplaceLDSUseWithPointer.cpp
   AMDGPURewriteOutArguments.cpp
   AMDGPUSubtarget.cpp
   AMDGPUTargetMachine.cpp
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUGeneralUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUGeneralUtils.h
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUGeneralUtils.h
@@ -0,0 +1,43 @@
+//===- AMDGPUGeneralUtils.h - general helper functions -*- C++ -*----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// AMDGPU target related general helper utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUGENERALUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUGENERALUTILS_H
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+namespace llvm {
+
+// Check if `GV` is an LDS global, and lowering is required for it.
+bool isLDSLoweringRequired(GlobalVariable *GV,
+                           const SmallPtrSetImpl<GlobalValue *> &UsedList,
+                           bool IsLDSLoweringPass = false);
+
+// Get the required alignment for the global variable `GV`.
+Align getAlign(const DataLayout &DL, const GlobalVariable *GV);
+
+// Get the set of all used global values in the module `M`.
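+// That is, the values pinned by the @llvm.used and @llvm.compiler.used lists.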
+SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUGENERALUTILS_H
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUGeneralUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUGeneralUtils.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUGeneralUtils.cpp
@@ -0,0 +1,168 @@
+//===- AMDGPUGeneralUtils.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// AMDGPU target related general helper utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUGeneralUtils.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+// Check if we can skip the lowering for the current LDS global `GV`.
+static bool skipLowering(GlobalVariable *GV,
+                         const SmallPtrSetImpl<GlobalValue *> &UsedList,
+                         bool IsLDSLoweringPass) {
+  bool UsedAsInitializer = false;
+  bool UsedAsNonLLVMUsedInitializer = false;
+  bool UsedWithinKernelFunction = false;
+  bool UsedWithinNonKernelFunction = false;
+  SmallPtrSet<User *, 8> VisitedUsers;
+
+  // If there are no users for `GV`, skip lowering for `GV`.
+  SmallVector<User *, 8> UserStack(GV->users());
+  if (UserStack.empty())
+    return true;
+
+  while (!UserStack.empty()) {
+    auto *U = UserStack.pop_back_val();
+
+    // Is `U` already visited? Continue to the next one.
+    if (!VisitedUsers.insert(U).second)
+      continue;
+
+    if (isa<GlobalVariable>(U)) {
+      // `U` is a global variable, and `GV` is used as its initializer.
+      UsedAsInitializer = true;
+      if (!UsedList.contains(GV)) {
+        // Used as an initializer of normal globals, apart from "llvm.used"
+        // or "llvm.compiler.used".
+        UsedAsNonLLVMUsedInitializer = true;
+      }
+      continue;
+    }
+
+    if (isa<Constant>(U)) {
+      // `U` is a `Constant`. Push back the users of `U`, and continue further
+      // exploring the stack.
+      append_range(UserStack, U->users());
+      continue;
+    }
+
+    // `U` should be an instruction belonging to some function.
+    auto *I = dyn_cast<Instruction>(U);
+    assert(I && "Instruction expected.");
+    auto *F = I->getFunction();
+    if (AMDGPU::isModuleEntryFunctionCC(F->getCallingConv())) {
+      UsedWithinKernelFunction = true;
+    } else {
+      UsedWithinNonKernelFunction = true;
+    }
+  }
+
+  if (UsedWithinNonKernelFunction) {
+    // `GV` is used within a non-kernel function; it requires lowering.
+    return false;
+  }
+
+  if (UsedAsInitializer) {
+    // `GV` is used as an initializer of some global variable.
+    if (UsedAsNonLLVMUsedInitializer) {
+      // `GV` is used as a global variable initializer of normal globals.
+      if (!IsLDSLoweringPass) {
+        // This is the "LDS replace with pointer" pass; let this pass make sure
+        // that a pointer variable is created for `GV`, and that the pointer
+        // variable is initialized with `GV` within all kernels.
+        return false;
+      } else {
+        // The "LDS replace with pointer" pass makes sure that a pointer
+        // variable is created for `GV` and is initialized with `GV` within all
+        // kernels, which means that a per-kernel-specific `GV` will be
+        // created, and hence the "LDS lowering" pass need not touch it.
+        return true;
+      }
+    } else {
+      // `GV` is used as a global variable initializer of "llvm.used" or
+      // "llvm.compiler.used". Ignore lowering.
+      return true;
+    }
+  }
+
+  if (UsedWithinKernelFunction) {
+    // `GV` is *only* used within kernels; it does not require lowering.
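+    // (A kernel can address its own instance of `GV` directly, so no pointer
+    // indirection is needed.)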
+    return true;
+  }
+
+  // Ideally control should not reach here. If it does, then we need to take a
+  // re-look at the above logic.
+  assert(false && "Internal error.");
+  return true;
+}
+
+// Check if `GV` is an LDS global, and lowering is required for it.
+bool isLDSLoweringRequired(GlobalVariable *GV,
+                           const SmallPtrSetImpl<GlobalValue *> &UsedList,
+                           bool IsLDSLoweringPass) {
+  if (GV->getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
+    // Ignore address spaces other than 3.
+    return false;
+  }
+
+  if (!GV->hasInitializer()) {
+    // addrspace(3) without an initializer implies cuda/hip extern __shared__;
+    // the semantics for such a variable appears to be that all extern
+    // __shared__ variables alias one another, in which case this transform is
+    // not required.
+    return false;
+  }
+
+  if (!isa<UndefValue>(GV->getInitializer())) {
+    // Initializers are unimplemented for local address space. Leave such
+    // variables in place for consistent error reporting.
+    return false;
+  }
+
+  if (GV->isConstant()) {
+    // A constant undef variable can't be written to, and any load is undef, so
+    // it should be eliminated by the optimizer. It could be dropped by the
+    // back end if not. This pass skips over it.
+    return false;
+  }
+
+  if (skipLowering(GV, UsedList, IsLDSLoweringPass)) {
+    // We can safely ignore the users of GV, hence lowering of GV is not
+    // required.
+    return false;
+  }
+
+  return true;
+}
+
+// Get the required alignment for the global variable `GV`.
+Align getAlign(const DataLayout &DL, const GlobalVariable *GV) {
+  return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
+                                       GV->getValueType());
+}
+
+// Get the set of all used global values in the module `M`.
+SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
+  SmallPtrSet<GlobalValue *, 32> UsedList;
+
+  SmallVector<GlobalValue *, 32> TmpVec;
+  collectUsedGlobalVariables(M, TmpVec, true);
+  UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+  TmpVec.clear();
+  collectUsedGlobalVariables(M, TmpVec, false);
+  UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+  return UsedList;
+}
+
+} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -2,6 +2,7 @@
   AMDGPUBaseInfo.cpp
   AMDKernelCodeTUtils.cpp
   AMDGPUAsmUtils.cpp
+  AMDGPUGeneralUtils.cpp
   AMDGPUPALMetadata.cpp
 
 LINK_COMPONENTS
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-disable-lower-module-lds=true -o - %s 2> %t | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-enable-lower-module-lds=false -o - %s 2> %t | FileCheck --check-prefix=GFX8 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-disable-lower-module-lds=true -o - %s 2> %t | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-lower-module-lds=false -o - %s 2> %t | FileCheck --check-prefix=GFX9 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
 
 @lds = internal addrspace(3) global float undef, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash llc -march=amdgcn -verify-machineinstrs -amdgpu-disable-lower-module-lds=true < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: not --crash llc -march=amdgcn -verify-machineinstrs -amdgpu-enable-lower-module-lds=false < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
 ; ERROR: LLVM ERROR: Unsupported expression in static initializer: addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*)
diff --git a/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address-codegen.ll b/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address-codegen.ll
--- a/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address-codegen.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-function-calls -amdgpu-stress-function-calls < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-function-calls -amdgpu-stress-function-calls -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=GCN %s
 
 @lds0 = addrspace(3) global i32 undef, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll b/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll
--- a/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll
+++ b/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll
@@ -1,7 +1,7 @@
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-always-inline %s | FileCheck --check-prefix=ALL %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-always-inline %s | FileCheck --check-prefix=ALL %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls -amdgpu-always-inline %s | FileCheck --check-prefix=ALL %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls -passes=amdgpu-always-inline %s | FileCheck --check-prefix=ALL %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-always-inline -amdgpu-enable-lower-module-lds=false %s | FileCheck --check-prefix=ALL %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-always-inline -amdgpu-enable-lower-module-lds=false %s | FileCheck --check-prefix=ALL %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls -amdgpu-always-inline -amdgpu-enable-lower-module-lds=false %s | FileCheck --check-prefix=ALL %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls -passes=amdgpu-always-inline -amdgpu-enable-lower-module-lds=false %s | FileCheck --check-prefix=ALL %s
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - -amdgpu-disable-lower-module-lds=true %s 2> %t | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - -amdgpu-enable-lower-module-lds=false %s 2> %t | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - -amdgpu-disable-lower-module-lds=true %s 2> %t | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - -amdgpu-enable-lower-module-lds=false %s 2> %t | FileCheck -check-prefixes=GCN,GFX9 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
 
 @lds = internal addrspace(3) global float undef, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
@@ -1,6 +1,10 @@
-; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; The pass `amdgpu-lower-module-lds` should be run with its prerequisite pass `amdgpu-replace-lds-use-with-pointer`.
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
 
+; The LDS globals @func and @both are used within non-kernel functions, but those functions are not called from any
+; kernel. Hence the pass `amdgpu-replace-lds-use-with-pointer` cannot help here, and the pass `amdgpu-lower-module-lds`
+; ends up creating `float` members. This is only true in this test case; in reality, those uncalled functions, and
+; hence the LDS globals referenced within them, would have been eliminated as unused globals.
 ; CHECK: %llvm.amdgcn.module.lds.t = type { float, float }
 
 @func = addrspace(3) global float undef, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; The pass `amdgpu-lower-module-lds` should be run with its prerequisite pass `amdgpu-replace-lds-use-with-pointer`.
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
 
 ; Variables that are not lowered by this pass are left unchanged
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
@@ -1,22 +1,33 @@
-; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; The pass `amdgpu-lower-module-lds` should be run with its prerequisite pass `amdgpu-replace-lds-use-with-pointer`.
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
 
-; CHECK: %llvm.amdgcn.module.lds.t = type { double, float }
-
-; CHECK: @function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to float*), align 8
-
-; CHECK: @kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* null to double*), align 8
-
-; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
-
+; The original LDS globals `@function_target` and `@kernel_target` are used as initializers of the globals
+; `@function_indirect` and `@kernel_indirect`, and they are not referenced directly anywhere else. The pass
+; `amdgpu-replace-lds-use-with-pointer` makes sure that they are referenced within all kernels by assigning
+; their addresses to the respective pointers, so the global initialization of `@function_indirect` and
+; `@kernel_indirect` is taken care of. Hence, the pass `amdgpu-lower-module-lds` does not do any further
+; lowering here.
+; CHECK-NOT: %llvm.amdgcn.module.lds.t
+
+; Original globals left unchanged.
+; CHECK: @function_target = addrspace(3) global float undef, align 4
+; CHECK: @function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* @function_target to float*), align 8
+; CHECK: @kernel_target = addrspace(3) global double undef, align 8
+; CHECK: @kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* @kernel_target to double*), align 8
 @function_target = addrspace(3) global float undef, align 4
 @function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* @function_target to float*), align 8
-
 @kernel_target = addrspace(3) global double undef, align 8
 @kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* @kernel_target to double*), align 8
 
-; CHECK-LABEL: @function(float %x)
+; New pointers introduced by the pass `amdgpu-replace-lds-use-with-pointer`.
+; CHECK: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2
+; CHECK: @llvm.amdgcn.lds.pointer.2 = internal unnamed_addr addrspace(3) global i16 undef, align 2
+
+; No changes to the function @function.
+; CHECK-LABEL: entry:
 ; CHECK: %0 = load float*, float* addrspace(1)* @function_indirect, align 8
+; CHECK: store float %x, float* %0, align 4
+; CHECK: ret void
 define void @function(float %x) local_unnamed_addr #5 {
 entry:
   %0 = load float*, float* addrspace(1)* @function_indirect, align 8
@@ -24,16 +35,19 @@
   ret void
 }
 
-; CHECK-LABEL: @kernel(double %x)
-; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
-; CHECK: %0 = load double*, double* addrspace(1)* @kernel_indirect, align 8
+; The LDS globals @function_target and @kernel_target are referenced within the kernel by initializing the
+; respective pointers with their addresses.
+; CHECK-LABEL: entry:
+; CHECK: %{{[0-9]+}} = ptrtoint {{[a-z]+}} addrspace(3)* @{{[a-z]+}}_target to i16
+; CHECK: store i16 %{{[0-9]+}}, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[12]}}, align 2
+; CHECK: %{{[0-9]+}} = ptrtoint {{[a-z]+}} addrspace(3)* @{{[a-z]+}}_target to i16
+; CHECK: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[12]}}, align 2
+; CHECK: %{{[0-9]+}} = load double*, double* addrspace(1)* @kernel_indirect, align 8
+; CHECK: store double %x, double* %{{[0-9]+}}, align 8
+; CHECK: ret void
 define amdgpu_kernel void @kernel(double %x) local_unnamed_addr #5 {
 entry:
   %0 = load double*, double* addrspace(1)* @kernel_indirect, align 8
   store double %x, double* %0, align 8
   ret void
 }
-
-
-
-
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inline-asm-call.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inline-asm-call.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inline-asm-call.ll
@@ -0,0 +1,31 @@
+; The pass `amdgpu-lower-module-lds` should be run with its prerequisite pass `amdgpu-replace-lds-use-with-pointer`.
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+; The pass `amdgpu-replace-lds-use-with-pointer` cannot handle inline asm calls, and hence the
+; pass `amdgpu-lower-module-lds` needs to handle the LDS global @func.
+; CHECK: %llvm.amdgcn.module.lds.t = type { i32 }
+
+; @func is only used from a non-kernel function so is rewritten
+; CHECK-NOT: @func
+@func = addrspace(3) global i32 undef, align 4
+
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4
+; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+
+; CHECK-LABEL: @function()
+; CHECK: %0 = load i32, i32 addrspace(3)* null, align 4
+; CHECK: ret i32 %0
+define i32 @function() local_unnamed_addr {
+entry:
+  %0 = load i32, i32 addrspace(3)* @func, align 4
+  ret i32 %0
+}
+
+; CHECK-LABEL: @kernel()
+; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK: call void asm sideeffect "", "~{v23}"()
+; CHECK: ret void
+define amdgpu_kernel void @kernel() {
+  call void asm sideeffect "", "~{v23}"()
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
@@ -1,8 +1,10 @@
-; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; The pass `amdgpu-lower-module-lds` should be run with its prerequisite pass `amdgpu-replace-lds-use-with-pointer`.
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
 
-; Check new struct is added to compiler.used and that the replaced variable is removed
-
+; The LDS global @tolower is used within a non-kernel function, but that function is not called from any
+; kernel. Hence the pass `amdgpu-replace-lds-use-with-pointer` cannot help here, and the pass
+; `amdgpu-lower-module-lds` ends up creating a `float` member. This is only true in this test case; in
+; reality, this uncalled function, and hence the LDS global referenced within it, would have been
+; eliminated as unused globals.
 ; CHECK: %llvm.amdgcn.module.lds.t = type { float }
 ; CHECK: @ignored = addrspace(1) global i64 0
 ; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
@@ -1,33 +1,43 @@
-; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; The pass `amdgpu-lower-module-lds` should be run with its prerequisite pass `amdgpu-replace-lds-use-with-pointer`.
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
 
-; Padding to meet alignment, so references to @var1 replaced with gep ptr, 0, 2
-; No i64 as addrspace(3) types with initializers are ignored. Likewise no addrspace(4).
-; CHECK: %llvm.amdgcn.module.lds.t = type { float, [4 x i8], i32 }
+; The LDS globals @var0 and @var1 are replaced by pointers, hence the two i16 members.
+; CHECK: %llvm.amdgcn.module.lds.t = type { i16, i16 } -; Variables removed by pass -; CHECK-NOT: @var0 -; CHECK-NOT: @var1 +; Original LDS globals +; CHECK: @var0 = addrspace(3) global float undef, align 8 +; CHECK: @var1 = addrspace(3) global i32 undef, align 8 @var0 = addrspace(3) global float undef, align 8 @var1 = addrspace(3) global i32 undef, align 8 -@ptr = addrspace(1) global i32 addrspace(3)* @var1, align 4 +; The initializer use of @var1 is left untouched +; CHECK: @ptr = addrspace(1) global i32 addrspace(3)* @var1, align 4 +@ptr = addrspace(1) global i32 addrspace(3)* @var1, align 4 -; A variable that is unchanged by pass +; A variable that is untouched by the pass because LDS variables with initializers are ignored ; CHECK: @with_init = addrspace(3) global i64 0 @with_init = addrspace(3) global i64 0 +; The two i16 pointers introduced by the pass `amdgpu-replace-lds-use-with-pointer` are removed by the pass `amdgpu-lower-module-lds`. +; CHECK-NOT: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK-NOT: @llvm.amdgcn.lds.pointer.2 = internal unnamed_addr addrspace(3) global i16 undef, align 2 + ; Instance of new type, aligned to max of element alignment -; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8 +; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 2 ; Use in func rewritten to access struct at address zero, which prints as null ; CHECK-LABEL: @func() -; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0 -; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2), align 4 +; CHECK: %1 = load i16, i16 addrspace(3)* +; CHECK: %2 = inttoptr i16 %1 to {{[a-z0-9]+}} addrspace(3)* +; CHECK: %3 = load i16, i16 addrspace(3)* +; CHECK: %4 = inttoptr i16 %3 to {{[a-z0-9]+}} addrspace(3)* +; CHECK: %dec = atomicrmw fsub float addrspace(3)* %{{[0-9]+}}, float 1.000000e+00 monotonic, align 4 +; CHECK: %val0 = load i32, i32 addrspace(3)* %{{[0-9]+}}, align 4 ; CHECK: %val1 = add i32 %val0, 4 -; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2), align 4 +; CHECK: store i32 %val1, i32 addrspace(3)* %{{[0-9]+}}, align 4 +; CHECK: %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic, align 8 +; CHECK: ret void define void @func() { %dec = atomicrmw fsub float addrspace(3)* @var0, float 1.0 monotonic %val0 = load i32, i32 addrspace(3)* @var1, align 4 @@ -40,17 +50,25 @@ ; This kernel calls a function that uses LDS so needs the block ; CHECK-LABEL: @kern_call() ; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: %1 = ptrtoint {{[a-z0-9]+}} addrspace(3)* @var{{[01]}} to i16 +; CHECK: store i16 %1, i16 addrspace(3)* +; CHECK: %2 = ptrtoint {{[a-z0-9]+}} addrspace(3)* @var{{[01]}} to i16 +; CHECK: store i16 %2, i16 addrspace(3)* ; CHECK: call void @func() -; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 2.0 +; CHECK: %dec = atomicrmw fsub float addrspace(3)* @var0, float 2.000000e+00 monotonic, align 4 +; CHECK: ret void define amdgpu_kernel void @kern_call() { call void @func() %dec = atomicrmw fsub float addrspace(3)* @var0, float 2.0 monotonic ret void } -; This 
kernel does not need to alloc the LDS block as it makes no calls +; Though the kernel makes no calls, it still needs to alloc the LDS block because @var1 is used as an initializer. ; CHECK-LABEL: @kern_empty() ; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: %1 = ptrtoint i32 addrspace(3)* @var1 to i16 +; CHECK: store i16 %1, i16 addrspace(3)* +; CHECK: ret void define spir_kernel void @kern_empty() { ret void } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll @@ -1,5 +1,5 @@ ; RUN: opt -S -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s -; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-disable-lower-module-lds=true < %s | FileCheck -check-prefix=ASM %s +; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=ASM %s target datalayout = "A5" diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_report_error_no_func_def.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_report_error_no_func_def.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_report_error_no_func_def.ll @@ -0,0 +1,20 @@ +; RUN: not --crash opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s 2>&1 | FileCheck -check-prefix=ERROR %s + +; ERROR: LLVM ERROR: The pass "Replace LDS Use With Pointer" assumes that the definitions of both caller and callee appear within same module. But, definition for the callee "callee_1" not available. 
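+
+; The pass resolves which kernels transitively reach each LDS-using non-kernel function by walking call
+; chains within the module, so a callee that is only declared here (@callee_1 below) cannot be traversed,
+; and the pass reports a fatal error rather than proceeding with an incomplete call graph.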
+ +@lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 16 + +declare hidden void @callee_1() local_unnamed_addr + +define internal void @callee_2() { +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +entry: + call void @callee_1() + call void @callee_2() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_direct_call_diamond_shape.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_direct_call_diamond_shape.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_direct_call_diamond_shape.ll @@ -0,0 +1,63 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=LDS,POINTER,GCN %s + +; LDS: @lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +; POINTER: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 + +define internal void @function_4() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.1, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 + ret void +} + +define internal void @function_3() { +; GCN-LABEL: entry: +; GCN: call void @function_4() +; GCN-NEXT: ret void +entry: + call void @function_4() + ret void +} + +define internal void @function_2() { +; GCN-LABEL: entry: +; GCN: call void @function_4() +; GCN-NEXT: ret void +entry: + call void @function_4() + ret void +} + +define internal void @function_1() { +; GCN-LABEL: entry: +; GCN: call void @function_2() +; GCN: call void @function_3() +; GCN-NEXT: ret void +entry: + call void @function_2() + call void @function_3() + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_1 to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.1, align 2 +; GCN-NEXT: call void @function_1() +; GCN-NEXT: ret void +entry: + call void @function_1() + ret void +} + +define protected amdgpu_kernel void @kernel_2() { +; GCN-LABEL: entry: +; GCN: ret void +entry: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_direct_call_misc.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_direct_call_misc.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_direct_call_misc.ll @@ -0,0 +1,89 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=LDS,POINTER,GCN %s + +; LDS: @lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 +; POINTER: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.2 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.3 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_2 = internal 
addrspace(3) global [1 x i32] undef, align 4 +@lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 + +define internal void @function_3() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_3, i32 0, i32 0 + ret void +} + +define internal void @function_2() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_2, i32 0, i32 0 + ret void +} + +define internal void @function_1() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel_3() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[13]}} to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[13]}} to i16 +; GCN-NEXT: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: call void @function_3() +; GCN-NEXT: call void @function_1() +; GCN-NEXT: ret void +entry: + call void @function_3() + call void @function_1() + ret void +} + +define protected amdgpu_kernel void @kernel_2() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[23]}} to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[23]}} to i16 +; GCN-NEXT: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: call void @function_2() +; GCN-NEXT: call void @function_3() +; GCN-NEXT: ret void +entry: + call void @function_2() + call void @function_3() + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[12]}} to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[12]}} to i16 +; GCN-NEXT: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: call void @function_1() +; GCN-NEXT: call void @function_2() +; GCN-NEXT: ret void +entry: + call void @function_1() + call void @function_2() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_ignored_lds.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_ignored_lds.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_ignored_lds.ll @@ -0,0 +1,80 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck 
-check-prefixes=LDS,POINTER,GCN %s + +; Ignore LDS lds_global_1 because it is dynamic LDS. +; Ignore LDS lds_global_2 because it is used only within a kernel. +; Ignore LDS lds_global_3 because it is used within a non-kernel function that is never called. +; Ignore LDS lds_global_4 because it is used within a non-kernel function that is not reachable from any kernel due to an inline asm call. + +; LDS: @lds_global_1 = external addrspace(3) global [0 x i32], align 4 +; LDS: @lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_4 = internal addrspace(3) global [1 x i32] undef, align 4 +; POINTER-NOT: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER-NOT: @llvm.amdgcn.lds.pointer.2 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER-NOT: @llvm.amdgcn.lds.pointer.3 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER-NOT: @llvm.amdgcn.lds.pointer.4 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@lds_global_1 = external addrspace(3) global [0 x i32], align 4 +@lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_4 = internal addrspace(3) global [1 x i32] undef, align 4 + +define internal void @function_1() { +; GCN-LABEL: entry: +; GCN: %gep = getelementptr inbounds [0 x i32], [0 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [0 x i32], [0 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +; GCN-LABEL: entry: +; GCN: call void @function_1() +; GCN-NEXT: ret void +entry: + call void @function_1() + ret void +} + +define protected amdgpu_kernel void @kernel_2() { +; GCN-LABEL: entry: +; GCN: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_2, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_2, i32 0, i32 0 + ret void +} + +define internal void @function_3() { +; GCN-LABEL: entry: +; GCN: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_3, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_3, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel_3() { +; GCN-LABEL: entry: +; GCN: ret void +entry: + ret void +} + +define internal void @function_4() { +; GCN-LABEL: entry: +; GCN: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_4, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_4, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel_4() { +; GCN-LABEL: entry: +; GCN: call void asm sideeffect "", "~{v23}"() +; GCN-NEXT: ret void +entry: + call void asm sideeffect "", "~{v23}"() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_diamond_shape.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_diamond_shape.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_diamond_shape.ll @@ -0,0 +1,81 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=FPTR,LDS,POINTER,GCN %s + +; FPTR: @gv.fptr4 = 
internal local_unnamed_addr externally_initialized global void ()* @function_4, align 8 +; FPTR: @gv.fptr3 = internal local_unnamed_addr externally_initialized global void ()* @function_3, align 8 +; FPTR: @gv.fptr2 = internal local_unnamed_addr externally_initialized global void ()* @function_2, align 8 +; FPTR: @gv.fptr1 = internal local_unnamed_addr externally_initialized global void ()* @function_1, align 8 +; LDS: @lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +; POINTER: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@gv.fptr4 = internal local_unnamed_addr externally_initialized global void ()* @function_4, align 8 +@gv.fptr3 = internal local_unnamed_addr externally_initialized global void ()* @function_3, align 8 +@gv.fptr2 = internal local_unnamed_addr externally_initialized global void ()* @function_2, align 8 +@gv.fptr1 = internal local_unnamed_addr externally_initialized global void ()* @function_1, align 8 +@lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 + +define internal void @function_4() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.1, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 + ret void +} + +define internal void @function_3() { +; GCN-LABEL: entry: +; GCN: %fptr4 = load void ()*, void ()** @gv.fptr4, align 8 +; GCN-NEXT: call void %fptr4() +; GCN-NEXT: ret void +entry: + %fptr4 = load void ()*, void ()** @gv.fptr4, align 8 + call void %fptr4() + ret void +} + +define internal void @function_2() { +; GCN-LABEL: entry: +; GCN: %fptr4 = load void ()*, void ()** @gv.fptr4, align 8 +; GCN-NEXT: call void %fptr4() +; GCN-NEXT: ret void +entry: + %fptr4 = load void ()*, void ()** @gv.fptr4, align 8 + call void %fptr4() + ret void +} + +define internal void @function_1() { +; GCN-LABEL: entry: +; GCN: %fptr2 = load void ()*, void ()** @gv.fptr2, align 8 +; GCN-NEXT: %fptr3 = load void ()*, void ()** @gv.fptr3, align 8 +; GCN-NEXT: call void %fptr2() +; GCN-NEXT: call void %fptr3() +; GCN-NEXT: ret void +entry: + %fptr2 = load void ()*, void ()** @gv.fptr2, align 8 + %fptr3 = load void ()*, void ()** @gv.fptr3, align 8 + call void %fptr2() + call void %fptr3() + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_1 to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.1, align 2 +; GCN-NEXT: %fptr1 = load void ()*, void ()** @gv.fptr1, align 8 +; GCN-NEXT: call void %fptr1() +; GCN-NEXT: ret void +entry: + %fptr1 = load void ()*, void ()** @gv.fptr1, align 8 + call void %fptr1() + ret void +} + +define protected amdgpu_kernel void @kernel_2() { +; GCN-LABEL: entry: +; GCN: ret void +entry: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_misc.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_misc.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_misc.ll @@ -0,0 +1,113 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=FPTR,LDS,POINTER,GCN %s + +; FPTR: @gv.fptr3 = internal local_unnamed_addr externally_initialized global void ()* 
@function_3, align 8 +; FPTR: @gv.fptr2 = internal local_unnamed_addr externally_initialized global void ()* @function_2, align 8 +; FPTR: @gv.fptr1 = internal local_unnamed_addr externally_initialized global void ()* @function_1, align 8 +; LDS: @lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 +; POINTER: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.2 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.3 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@gv.fptr3 = internal local_unnamed_addr externally_initialized global void ()* @function_3, align 8 +@gv.fptr2 = internal local_unnamed_addr externally_initialized global void ()* @function_2, align 8 +@gv.fptr1 = internal local_unnamed_addr externally_initialized global void ()* @function_1, align 8 +@lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 + +define internal void @function_3() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_3, i32 0, i32 0 + ret void +} + +define internal void @function_2() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_2, i32 0, i32 0 + ret void +} + +define internal void @function_1() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel_3() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %2 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %2, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %fptr3 = load void ()*, void ()** @gv.fptr3, align 8 +; GCN-NEXT: %fptr1 = load void ()*, void ()** @gv.fptr1, align 8 +; GCN-NEXT: call void %fptr3() +; GCN-NEXT: call void %fptr1() +; GCN-NEXT: ret void +entry: + %fptr3 = load void ()*, void ()** @gv.fptr3, align 8 + %fptr1 = load void ()*, void ()** @gv.fptr1, align 8 + call void 
%fptr3() + call void %fptr1() + ret void +} + +define protected amdgpu_kernel void @kernel_2() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %2 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %2, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %fptr2 = load void ()*, void ()** @gv.fptr2, align 8 +; GCN-NEXT: %fptr3 = load void ()*, void ()** @gv.fptr3, align 8 +; GCN-NEXT: call void %fptr2() +; GCN-NEXT: call void %fptr3() +; GCN-NEXT: ret void +entry: + %fptr2 = load void ()*, void ()** @gv.fptr2, align 8 + %fptr3 = load void ()*, void ()** @gv.fptr3, align 8 + call void %fptr2() + call void %fptr3() + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %2 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %2, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %fptr1 = load void ()*, void ()** @gv.fptr1, align 8 +; GCN-NEXT: %fptr2 = load void ()*, void ()** @gv.fptr2, align 8 +; GCN-NEXT: call void %fptr1() +; GCN-NEXT: call void %fptr2() +; GCN-NEXT: ret void +entry: + %fptr1 = load void ()*, void ()** @gv.fptr1, align 8 + %fptr2 = load void ()*, void ()** @gv.fptr2, align 8 + call void %fptr1() + call void %fptr2() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_misc2.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_misc2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_misc2.ll @@ -0,0 +1,128 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=LDS,POINTER,GCN %s + +; LDS: @lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 +; POINTER: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.2 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.3 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 + +define internal void ()* @return_function_3() { +; GCN-LABEL: entry: +; GCN: ret void ()* @function_3 +entry: + ret void ()* @function_3 +} + +define internal void ()* @return_function_2() { +; GCN-LABEL: entry: +; GCN: ret void ()* @function_2 +entry: + ret void ()* @function_2 +} + +define internal void ()* @return_function_1() { +; GCN-LABEL: entry: +; GCN: ret void ()* @function_1 +entry: + 
ret void ()* @function_1 +} + +define internal void @function_3() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_3, i32 0, i32 0 + ret void +} + +define internal void @function_2() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_2, i32 0, i32 0 + ret void +} + +define internal void @function_1() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel_3() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %2 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %2, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %fptr3 = call void ()* @return_function_3() +; GCN-NEXT: %fptr1 = call void ()* @return_function_1() +; GCN-NEXT: call void %fptr3() +; GCN-NEXT: call void %fptr1() +; GCN-NEXT: ret void +entry: + %fptr3 = call void ()* @return_function_3() + %fptr1 = call void ()* @return_function_1() + call void %fptr3() + call void %fptr1() + ret void +} + +define protected amdgpu_kernel void @kernel_2() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %2 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %2, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %fptr2 = call void ()* @return_function_2() +; GCN-NEXT: %fptr3 = call void ()* @return_function_3() +; GCN-NEXT: call void %fptr2() +; GCN-NEXT: call void %fptr3() +; GCN-NEXT: ret void +entry: + %fptr2 = call void ()* @return_function_2() + %fptr3 = call void ()* @return_function_3() + call void %fptr2() + call void %fptr3() + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; 
GCN-NEXT: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %2 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %2, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %fptr1 = call void ()* @return_function_1() +; GCN-NEXT: %fptr2 = call void ()* @return_function_2() +; GCN-NEXT: call void %fptr1() +; GCN-NEXT: call void %fptr2() +; GCN-NEXT: ret void +entry: + %fptr1 = call void ()* @return_function_1() + %fptr2 = call void ()* @return_function_2() + call void %fptr1() + call void %fptr2() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_no_addr_taken.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_no_addr_taken.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_no_addr_taken.ll @@ -0,0 +1,77 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=LDS,POINTER,GCN %s + +; LDS: @lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 +; POINTER-NOT: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER-NOT: @llvm.amdgcn.lds.pointer.2 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER-NOT: @llvm.amdgcn.lds.pointer.3 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 + +define internal void @function_3() { +; GCN-LABEL: entry: +; GCN: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_3, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_3, i32 0, i32 0 + ret void +} + +define internal void @function_2() { +; GCN-LABEL: entry: +; GCN: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_2, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_2, i32 0, i32 0 + ret void +} + +define internal void @function_1() { +; GCN-LABEL: entry: +; GCN: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel_3() { +; GCN-LABEL: entry: +; GCN: %alloca = alloca void ()* +; GCN-NEXT: %fptr = load void ()*, void ()** %alloca +; GCN-NEXT: call void %fptr() +; GCN-NEXT: ret void +entry: + %alloca = alloca void ()* + %fptr = load void ()*, void ()** %alloca + call void %fptr() + ret void +} + +define protected amdgpu_kernel void @kernel_2() { +; GCN-LABEL: entry: +; GCN: %alloca = alloca void ()* +; GCN-NEXT: %fptr = load void ()*, void ()** %alloca +; GCN-NEXT: call void %fptr() +; GCN-NEXT: ret void +entry: + %alloca = alloca void ()* + %fptr = load void ()*, void ()** %alloca + call void %fptr() + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +; GCN-LABEL: entry: +; GCN: %alloca = alloca void ()* +; GCN-NEXT: %fptr = load void ()*, void ()** %alloca +; GCN-NEXT: call 
void %fptr() +; GCN-NEXT: ret void +entry: + %alloca = alloca void ()* + %fptr = load void ()*, void ()** %alloca + call void %fptr() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_no_init.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_no_init.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_no_init.ll @@ -0,0 +1,69 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=FPTR,LDS,POINTER,GCN %s + +; FPTR: @gv.fptr3 = internal local_unnamed_addr externally_initialized global void ()* @function_3, align 8 +; FPTR: @gv.fptr2 = internal local_unnamed_addr externally_initialized global void ()* @function_2, align 8 +; FPTR: @gv.fptr1 = internal local_unnamed_addr externally_initialized global void ()* @function_1, align 8 +; LDS: @lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 +; POINTER: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.2 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.3 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@gv.fptr3 = internal local_unnamed_addr externally_initialized global void ()* @function_3, align 8 +@gv.fptr2 = internal local_unnamed_addr externally_initialized global void ()* @function_2, align 8 +@gv.fptr1 = internal local_unnamed_addr externally_initialized global void ()* @function_1, align 8 +@lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 + +define internal void @function_3() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_3, i32 0, i32 0 + ret void +} + +define internal void @function_2() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_2, i32 0, i32 0 + ret void +} + +define internal void @function_1() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = ptrtoint [1 x i32] 
addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %2 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %2, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %alloca = alloca void ()* +; GCN-NEXT: %fptr = load void ()*, void ()** %alloca +; GCN-NEXT: call void %fptr() +; GCN-NEXT: ret void +entry: + %alloca = alloca void ()* + %fptr = load void ()*, void ()** %alloca + call void %fptr() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_llvm_insts.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_llvm_insts.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_llvm_insts.ll @@ -0,0 +1,32 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=LDS,POINTER,GCN %s + +; LDS: @smem_ptr = hidden addrspace(3) global i32* undef, align 8 +; POINTER: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@smem_ptr = hidden addrspace(3) global i32* undef, align 8 + +define internal void @function_1() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.1, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to i32* addrspace(3)* +; GCN-NEXT: %2 = addrspacecast i32* addrspace(3)* %1 to i32** +; GCN-NEXT: %ptr = load i32*, i32** %2, align 8 +; GCN-NEXT: %res1 = atomicrmw add i32* %ptr, i32 8 acquire, align 4 +; GCN-NEXT: %res2 = cmpxchg i32* %ptr, i32 8, i32 16 acq_rel monotonic, align 4 +; GCN-NEXT: ret void +entry: + %ptr = load i32*, i32** addrspacecast (i32* addrspace(3)* @smem_ptr to i32**), align 8 + %res1 = atomicrmw add i32* %ptr, i32 8 acquire + %res2 = cmpxchg i32* %ptr, i32 8, i32 16 acq_rel monotonic + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint i32* addrspace(3)* @smem_ptr to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.1, align 2 +; GCN-NEXT: call void @function_1() +; GCN-NEXT: ret void +entry: + call void @function_1() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_types_misc.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_types_misc.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_types_misc.ll @@ -0,0 +1,39 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=LDS,POINTER,GCN %s + +; LDS: @lds_global_1 = internal addrspace(3) global [65 x i32] undef, align 16 +; LDS: @lds_global_2 = internal addrspace(3) global [65 x i16] undef, align 16 +; POINTER: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.2 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@lds_global_1 = internal addrspace(3) global [65 x i32] undef, align 16 +@lds_global_2 = internal addrspace(3) global [65 x i16] undef, align 16 + +define internal void @function_2() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [65 x i16] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [65 x i16], [65 x i16] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [65 x i16], [65 x i16] addrspace(3)* @lds_global_2, i32 0, i32 0 + ret void +} + +define 
internal void @function_1() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [65 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [65 x i32], [65 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [65 x i32], [65 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 + ret void +} + +; GCN: store i16 %{{[0-9]+}}, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN: store i16 %{{[0-9]+}}, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +define protected amdgpu_kernel void @kernel_1() { +entry: + call void @function_1() + call void @function_2() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_types_pointers.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_types_pointers.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_types_pointers.ll @@ -0,0 +1,55 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=LDS,POINTER,GCN %s + +; LDS: @smem_ptr = hidden addrspace(3) global i32* undef, align 8 +; POINTER: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@smem_ptr = hidden addrspace(3) global i32* undef, align 8 + +define internal void @function_2() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.1, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to i32* addrspace(3)* +; GCN-NEXT: %2 = addrspacecast i32* addrspace(3)* %1 to i32** +; GCN-NEXT: %3 = load i32*, i32** %2, align 8 +; GCN-NEXT: %4 = addrspacecast i32* addrspace(3)* %1 to i32** +; GCN-NEXT: store i32* %3, i32** %4, align 8 +; GCN-NEXT: ret void +entry: + %0 = load i32*, i32** addrspacecast (i32* addrspace(3)* @smem_ptr to i32**), align 8 + store i32* %0, i32** addrspacecast (i32* addrspace(3)* @smem_ptr to i32**), align 8 + ret void +} + +define internal void @function_1() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.1, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to i32* addrspace(3)* +; GCN-NEXT: %2 = addrspacecast i32* addrspace(3)* %1 to i32** +; GCN-NEXT: %3 = load i32*, i32** %2, align 8 +; GCN-NEXT: %4 = addrspacecast i32* addrspace(3)* %1 to i32** +; GCN-NEXT: store i32* %3, i32** %4, align 8 +; GCN-NEXT: ret void +entry: + %0 = load i32*, i32** addrspacecast (i32* addrspace(3)* @smem_ptr to i32**), align 8 + store i32* %0, i32** addrspacecast (i32* addrspace(3)* @smem_ptr to i32**), align 8 + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint i32* addrspace(3)* @smem_ptr to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.1, align 2 +; GCN-NEXT: call void @function_1() +; GCN-NEXT: call void @function_2() +; GCN-NEXT: ret void +entry: + call void @function_1() + call void @function_2() + ret void +} + +define protected amdgpu_kernel void @kernel_2() { +; GCN-LABEL: entry: +; GCN: ret void +entry: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_types_pointers_misc.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_types_pointers_misc.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_types_pointers_misc.ll @@ -0,0 +1,65 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=LDS,POINTER,GCN %s + +; 
LDS: @smem = hidden addrspace(3) global i32 undef, align 4 +; LDS: @smem_ptr = hidden addrspace(3) global i32* undef, align 8 +; LDS: @smem_ptr_ptr = hidden local_unnamed_addr addrspace(3) global i32** undef, align 8 +; LDS: @smem_arr = hidden addrspace(3) global [1 x i32] undef, align 4 +; LDS: @smem_ptr2 = hidden local_unnamed_addr addrspace(3) global i32* undef, align 8 +; POINTER: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.2 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.3 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.4 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.5 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@smem = hidden addrspace(3) global i32 undef, align 4 +@smem_ptr = hidden addrspace(3) global i32* undef, align 8 +@smem_ptr_ptr = hidden local_unnamed_addr addrspace(3) global i32** undef, align 8 +@smem_arr = hidden addrspace(3) global [1 x i32] undef, align 4 +@smem_ptr2 = hidden local_unnamed_addr addrspace(3) global i32* undef, align 8 + +; GCN: %{{[0-9]+}} = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN: %{{[0-9]+}} = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN: %{{[0-9]+}} = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +define internal void @function_2() { +entry: + store i32* addrspacecast (i32 addrspace(3)* @smem to i32*), i32* addrspace(3)* @smem_ptr, align 8 + store i32** addrspacecast (i32* addrspace(3)* @smem_ptr to i32**), i32** addrspace(3)* @smem_ptr_ptr, align 8 + %0 = load i32, i32 addrspace(3)* @smem, align 4 + ret void +} + +; GCN: %{{[0-9]+}} = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN: %{{[0-9]+}} = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN: %{{[0-9]+}} = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +define internal void @function_1() { +entry: + store i32* addrspacecast (i32 addrspace(3)* @smem to i32*), i32* addrspace(3)* @smem_ptr, align 8 + store i32** addrspacecast (i32* addrspace(3)* @smem_ptr to i32**), i32** addrspace(3)* @smem_ptr_ptr, align 8 + %0 = load i32, i32 addrspace(3)* @smem, align 4 + ret void +} + +; GCN: store i16 %{{[0-9]+}}, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN: store i16 %{{[0-9]+}}, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN: store i16 %{{[0-9]+}}, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +define protected amdgpu_kernel void @kernel_1() { +entry: + call void @function_1() + call void @function_2() + ret void +} + +; GCN: %{{[0-9]+}} = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN: %{{[0-9]+}} = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +define internal void @function_3() { +entry: + store i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(3)* @smem_arr, i32 0, i32 0) to i32*), i32* addrspace(3)* @smem_ptr2, align 8 + ret void +} + +; GCN: store i16 %{{[0-9]+}}, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN: store i16 %{{[0-9]+}}, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +define protected amdgpu_kernel void @kernel_2() { +entry: + call void 
@function_3() + ret void +}