diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -71,6 +71,7 @@ FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *); ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *); FunctionPass *createAMDGPURewriteOutArgumentsPass(); +ModulePass *createAMDGPUReplaceLDSUseWithPointerPass(); ModulePass *createAMDGPULowerModuleLDSPass(); FunctionPass *createSIModeRegisterPass(); @@ -146,6 +147,15 @@ TargetMachine &TM; }; +void initializeAMDGPUReplaceLDSUseWithPointerPass(PassRegistry &); +extern char &AMDGPUReplaceLDSUseWithPointerID; + +struct AMDGPUReplaceLDSUseWithPointerPass + : PassInfoMixin { + AMDGPUReplaceLDSUseWithPointerPass() {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + void initializeAMDGPULowerModuleLDSPass(PassRegistry &); extern char &AMDGPULowerModuleLDSID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -264,9 +264,16 @@ char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID; -INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE, - "Lower uses of LDS variables from non-kernel functions", false, - false) +INITIALIZE_PASS_BEGIN(AMDGPULowerModuleLDS, DEBUG_TYPE, + "Lower uses of LDS variables from non-kernel functions", + false, false) +// Before runnning current LDS lower pass, replace LDS uses within non-kernel +// functions by pointers so that the current pass minimizes the unnecessary per +// kernel allocation of LDS memory. 
+INITIALIZE_PASS_DEPENDENCY(AMDGPUReplaceLDSUseWithPointer) +INITIALIZE_PASS_END(AMDGPULowerModuleLDS, DEBUG_TYPE, + "Lower uses of LDS variables from non-kernel functions", + false, false) ModulePass *llvm::createAMDGPULowerModuleLDSPass() { return new AMDGPULowerModuleLDS(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp @@ -0,0 +1,598 @@ +//===-- AMDGPUReplaceLDSUseWithPointer.cpp --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The pass - "Lower Module LDS" supports use of LDS globals within non-kernel +// functions by lowering LDS globals as follows. It packs the LDS globals used +// within non-kernel functions into a struct type, and creates an instance of +// that struct type within every kernel at "address zero". +// +// However, the pass - "Lower Module LDS" sometimes wastes LDS memory depending +// on the pattern of LDS globals use within the module. +// +// Hence the current pass makes an attempt to aid the pass - "Lower Module LDS" +// for efficient LDS memory usage. The idea behind current pass is as follows: +// +// * Instead of directly packing LDS globals into the struct as struct members, +// create global LDS pointers corresponding to those LDS globals. +// * Initialize those global LDS pointers with their respective LDS globals. +// * Replace all the non-kernel function scope use of those original LDS globals +// by their respective pointer counter-parts. 
+// * Then the pass "Lower Module LDS" by the virtue of its implementation idea, +// lands-up packing only LDS pointers as struct members, which substentially +// reduces unnecessary LDS memory usage. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "Utils/AMDGPULDSUtils.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueMap.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include +#include +#include + +#define DEBUG_TYPE "amdgpu-replace-lds-use-with-pointer" + +using namespace llvm; + +namespace { + +// Error kinds for handling the errors within the context of current pass. +enum ReplaceLDSErrorKind : uint32_t { + LLEK_EndOfList = 0u, + LLEK_InternalError = 2u, + LLEK_NoCalleeDefinitionError = 3u +}; + +} // namespace + +// Report error within the context of current pass based on the error kind. +static void reportReplaceLDSError(ReplaceLDSErrorKind EK, Value *V = nullptr) { + std::string ErrStr("The pass \"Replace LDS Use With Pointer\" "); + + switch (EK) { + default: + case LLEK_InternalError: { + ErrStr = ErrStr + std::string("has encountered an internal error."); + break; + } + case LLEK_NoCalleeDefinitionError: { + ErrStr = + ErrStr + + std::string("assumes that the definitions of both caller and callee " + "appear within same module. But, definition for the " + "callee \"") + + V->getName().str() + std::string("\" not available."); + break; + } + } + + report_fatal_error(ErrStr); +} + +// Helper function around `ValueMap` to detect if an element exists within it. 
+template <typename R, typename E>
+static bool contains(R &&VMap, const E &Element) {
+  return VMap.find(Element) != VMap.end();
+}
+
+// Within User `U`, replace every operand that currently holds `OldValue` by
+// `NewValue`. Operands holding any other value are left untouched.
+static void updateUserOperand(User *U, Value *OldValue, Value *NewValue) {
+  // Writing through the `Use` avoids keeping a parallel operand index.
+  for (Use &Operand : U->operands())
+    if (Operand.get() == OldValue)
+      Operand.set(NewValue);
+}
+
+// The instruction `I` contains const expressions (possibly nested) as its
+// operands; convert those const expressions into corresponding instructions.
+//
+// For every constant expression found in `Operands`, an equivalent instruction
+// is created, inserted immediately before `I`, and substituted for the
+// constant expression within `I`. The same is then done recursively for the
+// operands of the new instruction. Every instruction created on the way is
+// added to `Insts`. Entries of `Operands` which are not constant expressions
+// are skipped.
+//
+// NOTE(review): the new instruction is inserted directly before `I`; if `I`
+// can be a PHI node this placement needs to be confirmed by the caller.
+static void getInstructions(Instruction *I, std::set<Value *> &Operands,
+                            std::set<Instruction *> &Insts) {
+  for (auto *V : Operands) {
+    auto *CE = dyn_cast<ConstantExpr>(V);
+    if (!CE)
+      continue;
+
+    auto *NI = CE->getAsInstruction();
+    NI->insertBefore(I);
+    updateUserOperand(I, CE, NI);
+    CE->removeDeadConstantUsers();
+    Insts.insert(NI);
+
+    // `NI` carries the operands of `CE`; expand them as well.
+    std::set<Value *> NestedOperands;
+    for (Use &Operand : CE->operands())
+      NestedOperands.insert(Operand.get());
+    getInstructions(NI, NestedOperands, Insts);
+  }
+}
+
+// Check if const expression `CE2` holds const expression `CE`, and return true
+// if `CE` exists within `CE2` (either `CE2` is `CE` itself, or `CE` appears
+// somewhere within the operand tree of `CE2`).
+static bool isCEExist(ConstantExpr *CE, ConstantExpr *CE2) {
+  if (CE == CE2)
+    return true;
+
+  // Stop at the first operand sub-tree which contains `CE`; there is nothing
+  // to gain from visiting the remaining operands once it has been found.
+  for (Use &Operand : CE2->operands()) {
+    auto *Nested = dyn_cast<ConstantExpr>(Operand.get());
+    if (Nested && isCEExist(CE, Nested))
+      return true;
+  }
+
+  return false;
+}
+
+// Collect all const expression operands of `I` which use `CE`, directly or
+// through nested const expressions.
+static std::set<Value *> getCEOperands(Instruction *I, ConstantExpr *CE) {
+  std::set<Value *> CEOperands;
+
+  for (Use &Operand : I->operands()) {
+    auto *OperandCE = dyn_cast<ConstantExpr>(Operand.get());
+    if (OperandCE && isCEExist(CE, OperandCE))
+      CEOperands.insert(Operand.get());
+  }
+
+  return CEOperands;
+}
+
+// Collect all those non-kernel functions and all those instructions within
+// which `U` exist.
+static std::map<Function *, std::set<Instruction *>>
+getFunctionToInstsMap(User *U) {
+  // Walk the user graph starting at `U`: constants are looked through (their
+  // users are visited in turn), global variables terminate the walk, and
+  // instructions are recorded under their parent function unless that
+  // function is a kernel.
+  std::map<Function *, std::set<Instruction *>> FunctionToInsts;
+  SmallVector<User *, 8> UserStack;
+  SmallPtrSet<User *, 8> VisitedUsers;
+
+  UserStack.push_back(U);
+
+  while (!UserStack.empty()) {
+    auto *UU = UserStack.pop_back_val();
+
+    // Each user is handled once, however many paths lead to it.
+    if (!VisitedUsers.insert(UU).second)
+      continue;
+
+    if (isa<GlobalVariable>(UU))
+      continue;
+
+    if (isa<Constant>(UU)) {
+      append_range(UserStack, UU->users());
+      continue;
+    }
+
+    if (auto *I = dyn_cast<Instruction>(UU)) {
+      auto *F = I->getFunction();
+      if (AMDGPU::isKernelCC(F))
+        continue;
+      // `operator[]` creates the per-function set on first use.
+      FunctionToInsts[F].insert(I);
+    }
+  }
+
+  return FunctionToInsts;
+}
+
+// Collect all call graph nodes which are reachable from the node `CGN`. The
+// SCC iteration started at `CGN` enumerates them; `CGN` itself is part of the
+// result.
+static std::set<CallGraphNode *>
+collectReachableCallGraphNodes(CallGraphNode *CGN) {
+  std::set<CallGraphNode *> ReachableCGNodes;
+
+  for (auto I = scc_begin(CGN); !I.isAtEnd(); ++I) {
+    const std::vector<CallGraphNode *> &SCC = *I;
+    assert(!SCC.empty() && "SCC with no functions?");
+    ReachableCGNodes.insert(SCC.begin(), SCC.end());
+  }
+
+  return ReachableCGNodes;
+}
+
+namespace {
+
+// Implementation of the transformation, shared by the legacy and the new pass
+// manager wrappers. One instance operates on one module.
+class ReplaceLDSUseImpl {
+  Module &M;
+  LLVMContext &Ctx;
+  const DataLayout &DL;
+
+  // Base address (an `i8` pointer in the LDS address space with value 0) that
+  // the 16-bit offsets loaded from LDS pointers are added to.
+  Constant *LDSMemBaseAddr;
+
+  // Holds all kernels defined within the module `M`.
+  std::vector<Function *> Kernels;
+
+  // Holds all those LDS globals defined within the module `M` which require
+  // pointer replacement.
+  std::vector<GlobalVariable *> LDSGlobals;
+
+  // Associates LDS global to a unique pointer which points to that LDS global.
+  std::map<GlobalVariable *, GlobalVariable *> LDSToPointer;
+
+  // Associates kernel K to LDS pointers which are initialized to point to
+  // corresponding LDS globals within K.
+  std::map<Function *, std::set<GlobalVariable *>> KernelToLDSPointers;
+
+  // Associates non-kernel function to an LDS global to LDS global replacement
+  // instruction.
+  std::map<Function *, std::map<GlobalVariable *, Value *>> FunctionToLDSToInst;
+
+public:
+  // FIXME: At present, we are assuming that LDS memory virtual base address
+  // starts from 0, though it is true, we should not make such assumptions at
+  // IR level.
+  explicit ReplaceLDSUseImpl(Module &M)
+      : M(M), Ctx(M.getContext()), DL(M.getDataLayout()),
+        LDSMemBaseAddr(Constant::getIntegerValue(
+            PointerType::get(Type::getInt8Ty(M.getContext()),
+                             AMDGPUAS::LOCAL_ADDRESS),
+            APInt(32, 0))) {}
+
+  // Entry-point function. Returns true if the module was modified.
+  bool replace();
+
+private:
+  // Create a set of replacement instructions which together replace `LDS`
+  // within `F` by accessing `LDS` indirectly using `LDSPointer`.
+  Value *getReplacementInst(Function *F, GlobalVariable *LDS,
+                            GlobalVariable *LDSPointer);
+
+  // Replace all the uses of LDS global `LDS` with the associated pointer
+  // `LDSPointer`.
+  void replaceUsesOfLDSGlobalByPointer(GlobalVariable *LDS,
+                                       GlobalVariable *LDSPointer);
+
+  // Initialize `LDSPointer` to point to `LDS` within kernel `K`.
+  void initializeLDSPointer(Function *K, GlobalVariable *LDS,
+                            GlobalVariable *LDSPointer);
+
+  // Insert new global LDS pointer which points to `LDS`.
+  GlobalVariable *createLDSPointer(GlobalVariable *LDS);
+
+  // For the LDS global `LDS`, recursively visit its user list and find all
+  // those non-kernel functions within which the `LDS` is being accessed.
+  std::set<Function *> collectNonKernelAccessorsOfLDS(GlobalVariable *LDS);
+
+  // Check if the pointer replacement of `LDS` is *not* required irrespective of
+  // if it is used within non-kernel function or not.
+  bool ignoreLDS(GlobalVariable *LDS, std::set<Function *> &LDSAccessors);
+
+  // Traverse `CallGraph` starting from the `CallGraphNode` associated with each
+  // kernel `K` and collect all callees which are reachable from K.
+  std::map<Function *, std::set<Function *>> collectReachableCallees();
+};
+
+// Create a set of replacement instructions which together replace `LDS` within
+// `F` by accessing `LDS` indirectly using `LDSPointer`.
+Value *ReplaceLDSUseImpl::getReplacementInst(Function *F, GlobalVariable *LDS, + GlobalVariable *LDSPointer) { + // The instruction which replaces `LDS` within `F` already created. + if (contains(FunctionToLDSToInst, F) && contains(FunctionToLDSToInst[F], LDS)) + return FunctionToLDSToInst[F][LDS]; + + // Get the instruction insertion point within the beginning of the entry + // block of current non-kernel function. + auto *EI = &(*(F->getEntryBlock().getFirstInsertionPt())); + IRBuilder<> Builder(EI); + + // Insert required set of instructions which replace `LDS` within `F`. + auto *V = Builder.CreateBitCast( + Builder.CreateGEP( + LDSMemBaseAddr, + Builder.CreateLoad(LDSPointer->getValueType(), LDSPointer)), + LDS->getType()); + + // Mark that the replacement instruction, which replace `LDS` within `F` is + // created. + if (!contains(FunctionToLDSToInst, F)) + FunctionToLDSToInst[F] = std::map(); + FunctionToLDSToInst[F][LDS] = V; + + return V; +} + +// Replace all the uses of LDS global `LDS ` with the associated pointer +// `LDSPointer`. +void ReplaceLDSUseImpl::replaceUsesOfLDSGlobalByPointer( + GlobalVariable *LDS, GlobalVariable *LDSPointer) { + SmallVector LDSUsers(LDS->users()); + for (auto *U : LDSUsers) { + // When `U` is a const expression, it is possible that same const expression + // exists within multiple instructions, and within multiple non-kernel + // functions. Collect all those non-kernel functions and all those + // instructions within which `U` exist. + auto FunctionToInsts = getFunctionToInstsMap(U); + + for (auto FI = FunctionToInsts.begin(), FE = FunctionToInsts.end(); + FI != FE; ++FI) { + for (auto *I : FI->second) { + // If `U` is a const expression, then we need to break the associated + // instruction into a set of separate instructions by converting const + // expressions into instructions. + std::set Insts; + + if (I == U) { + // `U` is an instruction, conversion from const expressions to + // instructions is *not* required. 
+ Insts.insert(I); + } else { + // `U` is a const expression, convert all associated const expressions + // (including U) to corresponding instructions. + auto *CE = dyn_cast(U); + assert(CE && "Expected constant expression."); + auto CEOperands = getCEOperands(I, CE); + getInstructions(I, CEOperands, Insts); + } + + // Go through all the instrutions, if `LDS` exist within them as an + // operand, then replace it by `V`. + for (auto *II : Insts) { + auto *V = getReplacementInst(FI->first, LDS, LDSPointer); + updateUserOperand(II, LDS, V); + } + } + } + } +} + +// Initialize `LDSPointer` to point to `LDS` within kernel `K`. +void ReplaceLDSUseImpl::initializeLDSPointer(Function *K, GlobalVariable *LDS, + GlobalVariable *LDSPointer) { + // `LDSPointer` is already initialized within `K`. + if (contains(KernelToLDSPointers, K) && + contains(KernelToLDSPointers[K], LDSPointer)) + return; + + // Insert instructions at `EI` which initialize `LDSPointer` to point-to `LDS` + // within `K`. + auto *EI = &(*(K->getEntryBlock().getFirstInsertionPt())); + IRBuilder<> Builder(EI); + Builder.CreateStore(Builder.CreatePtrToInt(LDS, Type::getInt16Ty(Ctx)), + LDSPointer); + + // Mark that `LDSPointer` is initialized within `K`. + if (!contains(KernelToLDSPointers, K)) + KernelToLDSPointers[K] = std::set(); + KernelToLDSPointers[K].insert(LDSPointer); +} + +// Insert new global LDS pointer which points to `LDS`. +GlobalVariable *ReplaceLDSUseImpl::createLDSPointer(GlobalVariable *LDS) { + // LDS pointer which points to `LDS` is already created. + if (contains(LDSToPointer, LDS)) + return LDSToPointer[LDS]; + + // Create new LDS pointer which points to `LDS`. 
+ auto *I16Ty = Type::getInt16Ty(Ctx); + GlobalVariable *LDSPointer = new GlobalVariable( + M, I16Ty, false, GlobalValue::InternalLinkage, UndefValue::get(I16Ty), + LDS->getName() + Twine(".offset"), nullptr, + GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); + LDSPointer->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + LDSPointer->setAlignment(AMDGPU::getAlign(M.getDataLayout(), LDSPointer)); + + // Mark that an associated LDS pointer is created for `LDS`. + LDSToPointer[LDS] = LDSPointer; + + return LDSPointer; +} + +// For the lds global `LDS`, recursively visit its user list and find all those +// non-kernel functions within which the `LDS` is being accessed. +std::set +ReplaceLDSUseImpl::collectNonKernelAccessorsOfLDS(GlobalVariable *LDS) { + std::set LDSAccessors; + std::set VisitedUsers; + SmallVector UserStack(LDS->users()); + + while (!UserStack.empty()) { + auto *U = UserStack.pop_back_val(); + + // `U` is already visited? continue to next one. + if (!VisitedUsers.insert(U).second) + continue; + + // `U` is a global variable which is initialized with `LDS`. Ignore LDS. + if (isa(U)) + return std::set(); + + // `U` is `Constant`. Push-back users of `U`, and continue further + // exploring the stack until an `Instruction` is found. + if (isa(U)) { + append_range(UserStack, U->users()); + continue; + } + + // `U` should be an instruction, if it belongs to a non-kernel function F, + // then collect F. + if (auto *I = dyn_cast(U)) { + auto *F = I->getFunction(); + if (!AMDGPU::isKernelCC(F)) + LDSAccessors.insert(F); + } else + reportReplaceLDSError(LLEK_InternalError); + } + + return LDSAccessors; +} + +// Check if the pointer replacement of `LDS` is *not* required irrespective of +// if it is used within non-kernel function or not. +bool ReplaceLDSUseImpl::ignoreLDS(GlobalVariable *LDS, + std::set &LDSAccessors) { + // Ignore `LDS` if its size is too small. Current threshold is 8 bytes. 
+ if (AMDGPU::getLDSGlobalSizeInBytes(M, LDS) <= 8) + return true; + + // There are *no* non-kernel functions which access `LDS` OR LDS is used + // within global scope in addition to non-kernel function scope. Ignore `LDS`. + if (LDSAccessors.empty()) + return true; + + return false; +} + +// Traverse `CallGraph` starting from the `CallGraphNode` associated with each +// kernel `K` and collect all the callees which are reachable from K (including +// indirectly called callees). +std::map> +ReplaceLDSUseImpl::collectReachableCallees() { + // Associates kernel to a list of non-kernel functions which are reachable + // from that kernel. + std::map> KernelToCallees; + + // Create the call graph `CG` of the module `M`, collect all the address taken + // functions, and explore `CG` to collect all the reachable callees (including + // indirectly called callees) from all kernels. + CallGraph CG = CallGraph(M); + + for (auto *K : Kernels) { + // Get `CallGraphNode` representing kernel `K`. + auto *KernCGNode = CG[K]; + + // Collect all call graph nodes which are reachable from `KernCGNode`. + std::set ReachableCGNodes = + collectReachableCallGraphNodes(KernCGNode); + + // Remove `CallGraphNode` representing kernel `K` from reachable node set. + ReachableCGNodes.erase(KernCGNode); + + // Collect all reachable callees from K. + std::set ReachableCallees; + for (auto *CGNode : ReachableCGNodes) { + if (auto *Callee = CGNode->getFunction()) + ReachableCallees.insert(Callee); + } + + KernelToCallees[K] = ReachableCallees; + } + + return KernelToCallees; +} + +// Entry-point function. +bool ReplaceLDSUseImpl::replace() { + // Track if this pass update the module. + bool Changed = false; + + // If there are *no* kernels defined within the module, or if there are *no* + // LDS globals which actually require pointer replacement, then nothing to do. 
+ Kernels = AMDGPU::collectKernels(M); + LDSGlobals = AMDGPU::findVariablesToLower(M, AMDGPU::getUsedList(M)); + if (Kernels.empty() || LDSGlobals.empty()) + return false; + + // Traverse `CallGraph` starting from the `CallGraphNode` associated with each + // kernel `K` and collect all callees which are reachable from K. + std::map> KernelToCallees = + collectReachableCallees(); + + // If there are *no* non-kernel functions which are reachable from any of the + // kernels, then nothing to do. + if (KernelToCallees.empty()) + return false; + + // For each collected LDS global, if required, create an associated global LDS + // pointer, initialize it within all relavent kernels, and finally replace all + // uses of original LDS globals by their pointer counter-parts. + for (auto *LDS : LDSGlobals) { + // For the lds global `LDS`, recursively visit its user list and find all + // those non-kernel functions within which the `LDS` is being accessed. + std::set LDSAccessors = collectNonKernelAccessorsOfLDS(LDS); + + // Check if the pointer replacement of `LDS` is *not* required irrespective + // of if it is used within non-kernel functions or not. + if (ignoreLDS(LDS, LDSAccessors)) + continue; + + // The global LDS pointer which points to `LDS` and replaces all the uses of + // `LDS`. + GlobalVariable *LDSPointer = nullptr; + + // Traverse through each kernel `K`, check and if required, initialize the + // `LDSPointer` to point to `LDS` within `K`. + for (auto KI = KernelToCallees.begin(), KE = KernelToCallees.end(); + KI != KE; ++KI) { + Function *K = KI->first; + std::set ReachableCallees = KI->second; + + std::set ReachableAndLDSUsedCallees; + std::set_intersection(LDSAccessors.begin(), LDSAccessors.end(), + ReachableCallees.begin(), ReachableCallees.end(), + std::inserter(ReachableAndLDSUsedCallees, + ReachableAndLDSUsedCallees.begin())); + + // None of the LDS accessing non-kernel functions are reachable from + // kernel `K`. 
Hence, no need to initialize `LDSPointer` within `K`. + if (ReachableAndLDSUsedCallees.empty()) + continue; + + // If it is first time encoutered, create a new global LDS pointer which + // points to `LDS`. + LDSPointer = createLDSPointer(LDS); + + // Initialize `LDSPointer` to point to `LDS` within kernel `K`. + initializeLDSPointer(K, LDS, LDSPointer); + } + + // Replace all the uses of LDS global `LDS ` with the associated pointer + // `LDSPointer`. + if (LDSPointer) { + replaceUsesOfLDSGlobalByPointer(LDS, LDSPointer); + Changed = true; + } + } + + return Changed; +} + +class AMDGPUReplaceLDSUseWithPointer : public ModulePass { +public: + static char ID; + + AMDGPUReplaceLDSUseWithPointer() : ModulePass(ID) { + initializeAMDGPUReplaceLDSUseWithPointerPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override; +}; + +} // namespace + +char AMDGPUReplaceLDSUseWithPointer::ID = 0; +char &llvm::AMDGPUReplaceLDSUseWithPointerID = + AMDGPUReplaceLDSUseWithPointer::ID; + +INITIALIZE_PASS(AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE, + "Replace within non-kernel function use of LDS with pointer", + false /*only look at the cfg*/, false /*analysis pass*/) + +bool AMDGPUReplaceLDSUseWithPointer::runOnModule(Module &M) { + ReplaceLDSUseImpl LDSReplacer{M}; + return LDSReplacer.replace(); +} + +ModulePass *llvm::createAMDGPUReplaceLDSUseWithPointerPass() { + return new AMDGPUReplaceLDSUseWithPointer(); +} + +PreservedAnalyses +AMDGPUReplaceLDSUseWithPointerPass::run(Module &M, ModuleAnalysisManager &AM) { + ReplaceLDSUseImpl LDSReplacer{M}; + LDSReplacer.replace(); + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -193,6 +193,11 @@ cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), cl::Hidden); +static cl::opt 
EnableLDSReplaceWithPointer( + "amdgpu-enable-lds-replace-with-pointer", + cl::desc("Enable LDS replace with pointer pass"), cl::init(true), + cl::Hidden); + static cl::opt EnableLowerModuleLDS( "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"), cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true), @@ -240,6 +245,7 @@ initializeAMDGPULateCodeGenPreparePass(*PR); initializeAMDGPUPropagateAttributesEarlyPass(*PR); initializeAMDGPUPropagateAttributesLatePass(*PR); + initializeAMDGPUReplaceLDSUseWithPointerPass(*PR); initializeAMDGPULowerModuleLDSPass(*PR); initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); @@ -506,6 +512,10 @@ PM.addPass(AMDGPUAlwaysInlinePass()); return true; } + if (PassName == "amdgpu-replace-lds-use-with-pointer") { + PM.addPass(AMDGPUReplaceLDSUseWithPointerPass()); + return true; + } if (PassName == "amdgpu-lower-module-lds") { PM.addPass(AMDGPULowerModuleLDSPass()); return true; @@ -890,6 +900,10 @@ // Replace OpenCL enqueued block function pointers with global variables. addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass()); + // This pass need to be run before "amdgpu-lower-module-lds" pass. 
+ if (EnableLDSReplaceWithPointer) + addPass(createAMDGPUReplaceLDSUseWithPointerPass()); + // Can increase LDS used by kernel so runs before PromoteAlloca if (EnableLowerModuleLDS) addPass(createAMDGPULowerModuleLDSPass()); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -81,6 +81,7 @@ AMDGPUPropagateAttributes.cpp AMDGPURegBankCombiner.cpp AMDGPURegisterBankInfo.cpp + AMDGPUReplaceLDSUseWithPointer.cpp AMDGPURewriteOutArguments.cpp AMDGPUSubtarget.cpp AMDGPUTargetMachine.cpp diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h @@ -29,8 +29,12 @@ std::vector findVariablesToLower(Module &M, const SmallPtrSetImpl &UsedList); +std::vector collectKernels(Module &M); + SmallPtrSet getUsedList(Module &M); +unsigned getLDSGlobalSizeInBytes(Module &M, const GlobalVariable *LDS); + } // end namespace AMDGPU } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp @@ -108,6 +108,17 @@ return LocalVars; } +std::vector collectKernels(Module &M) { + std::vector Kernels; + for (auto &F : M.functions()) { + // Collect `F` if it is a definition of an entry point function. 
+ if (!F.isDeclaration() && AMDGPU::isKernelCC(&F)) + Kernels.push_back(&F); + } + + return Kernels; +} + SmallPtrSet getUsedList(Module &M) { SmallPtrSet UsedList; @@ -122,6 +133,13 @@ return UsedList; } +unsigned getLDSGlobalSizeInBytes(Module &M, const GlobalVariable *LDS) { + auto *Ty = LDS->getValueType(); + auto SizeInBits = M.getDataLayout().getTypeSizeInBits(Ty).getFixedSize(); + auto SizeInBytes = SizeInBits / 8; + return SizeInBytes; +} + } // end namespace AMDGPU } // end namespace llvm diff --git a/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-call_diamond_shape.ll b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-call_diamond_shape.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-call_diamond_shape.ll @@ -0,0 +1,74 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck %s + +; Original LDS globals should exist as it is. +; CHECK: @lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4 + +; New global LDS pointers which point to original LDS globals must have been created. +; CHECK: @lds_used_within_func.offset = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Use of LDS globals within this function must have replaced by pointer counter-parts. 
+define internal void @func_uses_lds() { +; CHECK: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_func.offset, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_func, i32 0, i32 0 + ret void +} + +; This function remains unchanged +define internal void @func_does_not_use_lds_3() { +; CHECK: entry: +; CHECK: call void @func_uses_lds() +; CHECK: ret void +entry: + call void @func_uses_lds() + ret void +} + +; This function remains unchanged +define internal void @func_does_not_use_lds_2() { +; CHECK: entry: +; CHECK: call void @func_uses_lds() +; CHECK: ret void +entry: + call void @func_uses_lds() + ret void +} + +; This function remains unchanged +define internal void @func_does_not_use_lds_1() { +; CHECK: entry: +; CHECK: call void @func_does_not_use_lds_2() +; CHECK: call void @func_does_not_use_lds_3() +; CHECK: ret void +entry: + call void @func_does_not_use_lds_2() + call void @func_does_not_use_lds_3() + ret void +} + +; There is a call graph path from this kernel to `@func_uses_lds`, where LDS is being accessed, hence this kernel +; must do LDS pointer initialization. +define protected amdgpu_kernel void @reachable_kernel() { +; CHECK: entry: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_func to i16), i16 addrspace(3)* @lds_used_within_func.offset, align 2 +; CHECK: call void @func_does_not_use_lds_1() +; CHECK: ret void +entry: + call void @func_does_not_use_lds_1() + ret void +} + +; There is NO call graph path from this kernel to `@func_uses_lds` where LDS is being accessed, hence this kernel +; remains unchanged. 
+define protected amdgpu_kernel void @not_reachable_kernel() { +; CHECK: entry: +; CHECK: ret void +entry: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-call_miscellaneous.ll b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-call_miscellaneous.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-call_miscellaneous.ll @@ -0,0 +1,98 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck %s + +; Original LDS globals should exist as it is. +; CHECK: @lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4 +; CHECK: @lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4 +; CHECK: @lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4 + +; New global LDS pointers which point to original LDS globals must have been created. +; CHECK: @lds_used_within_function_1.offset = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK: @lds_used_within_function_2.offset = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK: @lds_used_within_function_3.offset = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Use of LDS globals within this function must have replaced by pointer counter-parts. 
+define internal void @function_3() { +; CHECK: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_3.offset, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_3, i32 0, i32 0 + ret void +} + +; Use of LDS globals within this function must have replaced by pointer counter-parts. +define internal void @function_2() { +; CHECK: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_2.offset, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_2, i32 0, i32 0 + ret void +} + +; Use of LDS globals within this function must have replaced by pointer counter-parts. +define internal void @function_1() { +; CHECK: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_1.offset, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_1, i32 0, i32 0 + ret void +} + +; This kernel calls functions 3 and 1, hence only lds globals which are used within functions 3 and 1 are +; considered here for corresponding pointer initialization. 
+define protected amdgpu_kernel void @kernel_calls_function_3_and_1() { +; CHECK: entry: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* @lds_used_within_function_3.offset, align 2 +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* @lds_used_within_function_1.offset, align 2 +; CHECK: call void @function_3() +; CHECK: call void @function_1() +; CHECK: ret void +entry: + call void @function_3() + call void @function_1() + ret void +} + +; This kernel calls functions 2 and 3, hence only lds globals which are used within functions 2 and 3 are +; considered here for corresponding pointer initialization. +define protected amdgpu_kernel void @kernel_calls_function_2_and_3() { +; CHECK: entry: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* @lds_used_within_function_3.offset, align 2 +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* @lds_used_within_function_2.offset, align 2 +; CHECK: call void @function_2() +; CHECK: call void @function_3() +; CHECK: ret void +entry: + call void @function_2() + call void @function_3() + ret void +} + +; This kernel calls functions 1 and 2, hence only lds globals which are used within functions 1 and 2 are +; considered here for corresponding pointer initialization. 
+define protected amdgpu_kernel void @kernel_calls_function_1_and_2() { +; CHECK: entry: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* @lds_used_within_function_2.offset, align 2 +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* @lds_used_within_function_1.offset, align 2 +; CHECK: call void @function_1() +; CHECK: call void @function_2() +; CHECK: ret void +entry: + call void @function_1() + call void @function_2() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-indirect_call_diamond_shape.ll b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-indirect_call_diamond_shape.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-indirect_call_diamond_shape.ll @@ -0,0 +1,78 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck %s + +; Original LDS globals should exist as it is. +; CHECK: @lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4 + +; New global LDS pointer should *NOT* have been created. +; CHECK-NOT: @lds_used_within_func.offset = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Other global variables should exist as it is. +; CHECK: @ptr_to_func = internal local_unnamed_addr externally_initialized global void ()* @func_uses_lds, align 8 +@ptr_to_func = internal local_unnamed_addr externally_initialized global void ()* @func_uses_lds, align 8 + +; Uses of LDS globals within this non-kernel function remains unchanged since this function is *INDIRECTLY* called. 
+define internal void @func_uses_lds() { +; CHECK: entry: +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_func, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_func, i32 0, i32 0 + ret void +} + +; This function remains unchanged +define internal void @func_does_not_use_lds_3() { +; CHECK: entry: +; CHECK: %fptr = load void ()*, void ()** @ptr_to_func, align 8 +; CHECK: call void %fptr() +; CHECK: ret void +entry: + %fptr = load void ()*, void ()** @ptr_to_func, align 8 + call void %fptr() + ret void +} + +; This function remains unchanged +define internal void @func_does_not_use_lds_2() { +; CHECK: entry: +; CHECK: %fptr = load void ()*, void ()** @ptr_to_func, align 8 +; CHECK: call void %fptr() +; CHECK: ret void +entry: + %fptr = load void ()*, void ()** @ptr_to_func, align 8 + call void %fptr() + ret void +} + +; This function remains unchanged +define internal void @func_does_not_use_lds_1() { +; CHECK: entry: +; CHECK: call void @func_does_not_use_lds_2() +; CHECK: call void @func_does_not_use_lds_3() +; CHECK: ret void +entry: + call void @func_does_not_use_lds_2() + call void @func_does_not_use_lds_3() + ret void +} + +; There is a call graph path from this kernel to `@func_uses_lds`, where LDS is being accessed, but `@func_uses_lds` +; is called *INDIRECTLY* and hence it is not reachable, and hence pointer replacement does not take place. +define protected amdgpu_kernel void @reachable_kernel() { +; CHECK: entry: +; CHECK: call void @func_does_not_use_lds_1() +; CHECK: ret void +entry: + call void @func_does_not_use_lds_1() + ret void +} + +; There is NO call graph path from this kernel to `@func_uses_lds` where LDS is being accessed, hence this kernel +; remains unchanged. 
+define protected amdgpu_kernel void @not_reachable_kernel() { +; CHECK: entry: +; CHECK: ret void +entry: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-small_lds.ll b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-small_lds.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-small_lds.ll @@ -0,0 +1,28 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck %s + +; Original LDS globals should exist as it is. +; CHECK: @small_lds = addrspace(3) global float undef, align 8 +@small_lds = addrspace(3) global float undef, align 8 + +; New global LDS pointers are not expected to be created. +; CHECK-NOT: @small_lds.offset = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; This function uses LDS global `@small_lds`, and is reachable from `@kern`, but since `@small_lds` is too small +; for pointer replacement, it is ignored. +define void @func() { +; CHECK: entry: +; CHECK: %dec = atomicrmw fsub float addrspace(3)* @small_lds, float 1.000000e+00 monotonic, align 4 +; CHECK: ret void +entry: + %dec = atomicrmw fsub float addrspace(3)* @small_lds, float 1.0 monotonic + ret void +} + +define amdgpu_kernel void @kern() { +; CHECK: entry: +; CHECK: call void @func() +; CHECK: ret void +entry: + call void @func() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_both_within_kernel_and_func.ll b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_both_within_kernel_and_func.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_both_within_kernel_and_func.ll @@ -0,0 +1,34 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck %s + +; Original LDS globals should exist as it is +; CHECK: @lds_used_both_within_kernel_and_function = internal addrspace(3) global [4 x i32] undef, align 4 
+@lds_used_both_within_kernel_and_function = internal addrspace(3) global [4 x i32] undef, align 4 + +; New global LDS pointers which point to original LDS globals must have been created. +; CHECK: @lds_used_both_within_kernel_and_function.offset = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Uses of LDS globals within this non-kernel function should be replaced by pointers. +define internal void @func_uses_lds() { +; CHECK: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_both_within_kernel_and_function.offset, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_both_within_kernel_and_function, i32 0, i32 0 + ret void +} + +; Pointers should be initialized within this kernel, but uses of the original LDS within this kernel do *NOT* need to be replaced. 
+define protected amdgpu_kernel void @kernel_uses_lds() { +; CHECK: entry: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_both_within_kernel_and_function to i16), i16 addrspace(3)* @lds_used_both_within_kernel_and_function.offset, align 2 +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_both_within_kernel_and_function, i32 0, i32 0 +; CHECK: call void @func_uses_lds() +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_both_within_kernel_and_function, i32 0, i32 0 + call void @func_uses_lds() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_only_within_func.ll b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_only_within_func.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_only_within_func.ll @@ -0,0 +1,32 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck %s + +; Original LDS globals should exist as it is +; CHECK: @lds_used_within_function = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function = internal addrspace(3) global [4 x i32] undef, align 4 + +; New global LDS pointers which point to original LDS globals must have been created. +; CHECK: @lds_used_within_function.offset = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Uses of LDS globals within this non-kernel function should be replaced by pointers. 
+define internal void @func_uses_lds() { +; CHECK: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function.offset, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function, i32 0, i32 0 + ret void +} + +; Pointers should be initialized within this kernel. +define protected amdgpu_kernel void @kernel_uses_lds() { +; CHECK: entry: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function to i16), i16 addrspace(3)* @lds_used_within_function.offset, align 2 +; CHECK: call void @func_uses_lds() +; CHECK: ret void +entry: + call void @func_uses_lds() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_only_within_kernel.ll b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_only_within_kernel.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_only_within_kernel.ll @@ -0,0 +1,19 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck %s + +; Original LDS globals should exist as it is. +; CHECK: @lds_used_within_kernel = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_kernel = internal addrspace(3) global [4 x i32] undef, align 4 + +; Since lds global is used *only* within kernel, there is no pointer replacement of lds global required, hence +; global pointer should *NOT* have created. +; CHECK-NOT: @lds_used_within_kernel.offset = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Kernel remains unchanged. 
+define protected amdgpu_kernel void @kernel_uses_lds() { +; CHECK: entry: +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_kernel, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_kernel, i32 0, i32 0 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_within_both_global_and_func_scope.ll b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_within_both_global_and_func_scope.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_within_both_global_and_func_scope.ll @@ -0,0 +1,36 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck %s + +; Original globals should exist as it is. +; CHECK: @ignored1 = internal addrspace(3) global [4 x i32] undef, align 4 +; CHECK: @ignored2 = addrspace(1) global i64 0 +; CHECK: @llvm.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @ignored1 to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored2 to i8 addrspace(1)*) to i8*)], section "llvm.metadata" +; CHECK: @llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @ignored1 to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored2 to i8 addrspace(1)*) to i8*)], section "llvm.metadata" +@ignored1 = internal addrspace(3) global [4 x i32] undef, align 4 +@ignored2 = addrspace(1) global i64 0 +@llvm.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @ignored1 to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored2 to i8 addrspace(1)*) to i8*)], section "llvm.metadata" +@llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 
addrspace(3)* bitcast ([4 x i32] addrspace(3)* @ignored1 to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored2 to i8 addrspace(1)*) to i8*)], section "llvm.metadata" + +; New global LDS pointers are not expected to be created. +; CHECK-NOT: @ignored1.offset = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; This function uses LDS global `@ignored1`, and is reachable from `@kernel`, but since `@ignored1` is +; also used within global scope, pointer replacement is ignored. +define void @func() { +; CHECK: entry: +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @ignored1, i32 0, i32 0 +; CHECK: %unused0 = atomicrmw add i64 addrspace(1)* @ignored2, i64 1 monotonic +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @ignored1, i32 0, i32 0 + %unused0 = atomicrmw add i64 addrspace(1)* @ignored2, i64 1 monotonic + ret void +} + +define protected amdgpu_kernel void @kernel() { +; CHECK: entry: +; CHECK: call void @func() +; CHECK: ret void +entry: + call void @func() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_within_both_global_and_func_scope2.ll b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_within_both_global_and_func_scope2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_within_both_global_and_func_scope2.ll @@ -0,0 +1,30 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck %s + +; Original globals should exist as it is. 
+; CHECK: @ignored1 = internal addrspace(3) global [4 x i32] undef, align 4 +; CHECK: @ignored2 = addrspace(1) global float* addrspacecast (float addrspace(3)* bitcast ([4 x i32] addrspace(3)* @ignored1 to float addrspace(3)*) to float*), align 8 +@ignored1 = internal addrspace(3) global [4 x i32] undef, align 4 +@ignored2 = addrspace(1) global float* addrspacecast ([4 x i32] addrspace(3)* @ignored1 to float*), align 8 + +; New global LDS pointer is not expected to be created. +; CHECK-NOT: @ignored1.offset = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; This function uses LDS global `@ignored1`, and is reachable from `@kernel`, but since `@ignored1` is +; also used within global scope, pointer replacement is ignored. +define void @func() { +; CHECK: entry: +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @ignored1, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @ignored1, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel() { +; CHECK: entry: +; CHECK: call void @func() +; CHECK: ret void +entry: + call void @func() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_within_const_expr.ll b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_within_const_expr.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_within_const_expr.ll @@ -0,0 +1,72 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck %s + +; Original LDS globals should exist as it is. 
+@used_only_within_func = addrspace(3) global [4 x i32] undef, align 4 +@used_only_within_kern = addrspace(3) global [4 x i32] undef, align 4 +@used_within_both_func_and_kern = addrspace(3) global [4 x i32] undef, align 4 +; CHECK: @used_only_within_func = addrspace(3) global [4 x i32] undef, align 4 +; CHECK: @used_only_within_kern = addrspace(3) global [4 x i32] undef, align 4 +; CHECK: @used_within_both_func_and_kern = addrspace(3) global [4 x i32] undef, align 4 + +; LDS pointers should be created for vars `@used_only_within_func` and `@used_within_both_func_and_kern` +; since both of them are used within non-kernel functions, but not for var `@used_only_within_kern` since +; it is used only within kernel. +; CHECK: @used_only_within_func.offset = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK: @used_within_both_func_and_kern.offset = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK-NOT: @used_only_within_kern.offset = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; pointer replacement is required for `@used_only_within_func` +define i32 @get() { +; CHECK: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @used_only_within_func.offset, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: %4 = addrspacecast i32 addrspace(3)* %3 to i32* +; CHECK: %5 = ptrtoint i32* %4 to i64 +; CHECK: %6 = add i64 %5, %5 +; CHECK: %7 = inttoptr i64 %6 to i32* +; CHECK: %8 = load i32, i32* %7, align 4 +; CHECK: ret i32 %8 +entry: + %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 
addrspace(3)*) to i32*) to i64)) to i32*), align 4 + ret i32 %0 +} + +; pointer replacement is required for `@used_within_both_func_and_kern` +define void @set(i32 %x) { +; CHECK: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @used_within_both_func_and_kern.offset, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: %4 = addrspacecast i32 addrspace(3)* %3 to i32* +; CHECK: %5 = ptrtoint i32* %4 to i64 +; CHECK: %6 = add i64 %5, %5 +; CHECK: %7 = inttoptr i64 %6 to i32* +; CHECK: store i32 %x, i32* %7, align 4 +; CHECK: ret void +entry: + store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_within_both_func_and_kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_within_both_func_and_kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 + ret void +} + +; pointer replacement is not required for `@used_only_within_kern` +define amdgpu_kernel void @timestwo() { +; CHECK: entry: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @used_within_both_func_and_kern to i16), i16 addrspace(3)* @used_within_both_func_and_kern.offset, align 2 +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @used_only_within_func to i16), i16 addrspace(3)* @used_only_within_func.offset, align 2 +; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_within_both_func_and_kern, i32 0, i32 0) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_kern, i32 0, i32 0) to i32*) to i64)) to i32*), align 4 +; CHECK: %mul = mul i32 %ld, 2 +; 
CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_kern, i32 0, i32 0) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_within_both_func_and_kern, i32 0, i32 0) to i32*) to i64)) to i32*), align 4 +; CHECK: call void @set(i32 0) +; CHECK: %0 = call i32 @get() +; CHECK: ret void +entry: + %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_within_both_func_and_kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 + %mul = mul i32 %ld, 2 + store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_within_both_func_and_kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 + call void @set(i32 0) + %0 = call i32 @get() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_within_const_expr2.ll b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_within_const_expr2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_within_const_expr2.ll @@ -0,0 +1,50 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck %s + +; Original LDS globals should exist as it is. 
+@lds_used_within_function = internal addrspace(3) global [4 x i32] undef, align 4 +@global_pointer = addrspace(1) global i32 undef, align 4 + +; LDS pointer should be created for `@lds_used_within_function` +; CHECK: @lds_used_within_function.offset = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; pointer replacement for `@lds_used_within_function` required +define internal void @func_uses_lds2() { +; CHECK: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function.offset, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %3 = ptrtoint [4 x i32] addrspace(3)* %2 to i32 +; CHECK: %4 = add i32 %3, %3 +; CHECK: store i32 %4, i32 addrspace(1)* @global_pointer, align 4 +; CHECK: ret void +entry: + store i32 add (i32 ptrtoint (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function, i32 0, i32 0) to i32), i32 ptrtoint (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function, i32 0, i32 0) to i32)), i32 addrspace(1)* @global_pointer, align 4 + ret void +} + +; pointer replacement for `@lds_used_within_function` required +define internal void @func_uses_lds() { +; CHECK: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function.offset, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %3 = ptrtoint [4 x i32] addrspace(3)* %2 to i32 +; CHECK: %4 = add i32 %3, %3 +; CHECK: store i32 %4, i32 addrspace(1)* @global_pointer, align 4 +; CHECK: ret void +entry: + store i32 add (i32 ptrtoint (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function, i32 0, i32 0) to i32), i32 ptrtoint (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function, i32 
0, i32 0) to i32)), i32 addrspace(1)* @global_pointer, align 4 + ret void +} + +define protected amdgpu_kernel void @kernel_uses_lds() { +; CHECK: entry: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function to i16), i16 addrspace(3)* @lds_used_within_function.offset, align 2 +; CHECK: call void @func_uses_lds() +; CHECK: call void @func_uses_lds2() +; CHECK: ret void +entry: + call void @func_uses_lds() + call void @func_uses_lds2() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_within_const_expr3.ll b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_within_const_expr3.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_within_const_expr3.ll @@ -0,0 +1,52 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck %s + +; Original LDS globals should exist as it is. +@lds_used_within_function = internal addrspace(3) global [4 x i32] undef, align 4 +@global_var = internal addrspace(1) global [4 x i32] undef, align 4 + +; LDS pointer should be created for `@lds_used_within_function` +; CHECK: @lds_used_within_function.offset = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; pointer replacement for `@lds_used_within_function` required +define internal void @func_uses_lds2() { +; CHECK: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function.offset, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 2 +; CHECK: %4 = addrspacecast i32 addrspace(3)* %3 to i32* +; CHECK: %5 = ptrtoint i32* %4 to i32 +; CHECK: %6 = add i32 %5, ptrtoint (i32 addrspace(1)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(1)* @global_var, i32 0, i32 2) to i32) +; CHECK: ret void +entry: + %0 = add i32 ptrtoint (i32* 
addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function, i32 0, i32 2) to i32*) to i32), ptrtoint (i32 addrspace(1)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(1)* @global_var, i32 0, i32 2) to i32) + ret void +} + +; pointer replacement for `@lds_used_within_function` required +define internal void @func_uses_lds1() { +; CHECK: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function.offset, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 2 +; CHECK: %4 = addrspacecast i32 addrspace(3)* %3 to i32* +; CHECK: %5 = ptrtoint i32* %4 to i32 +; CHECK: %6 = add i32 %5, ptrtoint (i32 addrspace(1)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(1)* @global_var, i32 0, i32 2) to i32) +; CHECK: ret void +entry: + %0 = add i32 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function, i32 0, i32 2) to i32*) to i32), ptrtoint (i32 addrspace(1)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(1)* @global_var, i32 0, i32 2) to i32) + ret void +} + +define protected amdgpu_kernel void @kernel_uses_lds() { +; CHECK: entry: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function to i16), i16 addrspace(3)* @lds_used_within_function.offset, align 2 +; CHECK: call void @func_uses_lds1() +; CHECK: call void @func_uses_lds2() +; CHECK: ret void +entry: + call void @func_uses_lds1() + call void @func_uses_lds2() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_within_not_rechable_func.ll b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_within_not_rechable_func.ll new file mode 100644 --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/lds_replace_by_pointer-use_within_not_rechable_func.ll @@ -0,0 +1,44 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck %s + +; Original LDS globals should exist as it is. +; CHECK: @lds_used_within_function = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function = internal addrspace(3) global [4 x i32] undef, align 4 + +; Other global variables should exist as it is. +; CHECK: @ptr_to_func = internal local_unnamed_addr externally_initialized global void ()* @func_uses_lds_2, align 8 +@ptr_to_func = internal local_unnamed_addr externally_initialized global void ()* @func_uses_lds_2, align 8 + +; New global LDS pointers is not expected to be created. +; CHECK-NOT: @lds_used_within_function.offset = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Uses of LDS globals within this non-kernel function remains unchanged since this function is not called. +define internal void @func_uses_lds_1() { +; CHECK: entry: +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function, i32 0, i32 0 + ret void +} + +; Uses of LDS globals within this non-kernel function remains unchanged since this function is *INDIRECTLY* called. +define internal void @func_uses_lds_2() { +; CHECK: entry: +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function, i32 0, i32 0 + ret void +} + +; Kernel remains unchanged. 
+define protected amdgpu_kernel void @kernel() { +; CHECK: entry: +; CHECK: %fptr = load void ()*, void ()** @ptr_to_func, align 8 +; CHECK: call void %fptr() +; CHECK: ret void +entry: + %fptr = load void ()*, void ()** @ptr_to_func, align 8 + call void %fptr() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll @@ -1,5 +1,5 @@ ; RUN: opt -S -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s -; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=ASM %s +; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-lds-replace-with-pointer=false -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=ASM %s target datalayout = "A5"