diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -71,6 +71,7 @@ FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *); ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *); FunctionPass *createAMDGPURewriteOutArgumentsPass(); +ModulePass *createAMDGPUReplaceLDSUseWithPointerPass(); ModulePass *createAMDGPULowerModuleLDSPass(); FunctionPass *createSIModeRegisterPass(); @@ -146,6 +147,15 @@ TargetMachine &TM; }; +void initializeAMDGPUReplaceLDSUseWithPointerPass(PassRegistry &); +extern char &AMDGPUReplaceLDSUseWithPointerID; + +struct AMDGPUReplaceLDSUseWithPointerPass + : PassInfoMixin { + AMDGPUReplaceLDSUseWithPointerPass() {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + void initializeAMDGPULowerModuleLDSPass(PassRegistry &); extern char &AMDGPULowerModuleLDSID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -24,6 +24,13 @@ // A possible future refinement is to specialise the structure per-kernel, so // that fields can be elided based on more expensive analysis. // +// NOTE: Since this pass will directly pack LDS (assume large LDS) into a struct +// type which would cause allocating huge memory for struct instance within +// every kernel. Hence, before running this pass, it is advisable to run the +// pass "amdgpu-replace-lds-use-with-pointer" which will replace LDS uses within +// non-kernel functions by pointers and thereby minimizes the unnecessary per +// kernel allocation of LDS memory. 
+// //===----------------------------------------------------------------------===// #include "AMDGPU.h" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp @@ -0,0 +1,418 @@ +//===-- AMDGPUReplaceLDSUseWithPointer.cpp --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass replaces all the uses of LDS within non-kernel functions by +// corresponding pointer counter-parts. +// +// The main motivation behind this pass is - to *avoid* subsequent LDS lowering +// pass from directly packing LDS (assume large LDS) into a struct type which +// would otherwise cause allocating huge memory for struct instance within every +// kernel. +// +// Brief sketch of the algorithm implemented in this pass is as below: +// +// 1. Collect all the LDS defined in the module which qualify for pointer +// replacement, say it is, LDSGlobals set. +// +// 2. Collect all the reachable callees for each kernel defined in the module, +// say it is, KernelToCallees map. +// +// 3. FOR (each global GV from LDSGlobals set) DO +// LDSUsedNonKernels = Collect all non-kernel functions which use GV. +// FOR (each kernel K in KernelToCallees map) DO +// ReachableCallees = KernelToCallees[K] +// ReachableAndLDSUsedCallees = +// SetIntersect(LDSUsedNonKernels, ReachableCallees) +// IF (ReachableAndLDSUsedCallees is not empty) THEN +// Pointer = Create a pointer to point-to GV if not created. +// Initialize Pointer to point-to GV within kernel K. +// ENDIF +// ENDFOR +// Replace all uses of GV within non kernel functions by Pointer. 
+// ENDFOR +// +// LLVM IR example: +// +// Input IR: +// +// @lds = internal addrspace(3) global [4 x i32] undef, align 16 +// +// define internal void @f0() { +// entry: +// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds, +// i32 0, i32 0 +// ret void +// } +// +// define protected amdgpu_kernel void @k0() { +// entry: +// call void @f0() +// ret void +// } +// +// Output IR: +// +// @lds = internal addrspace(3) global [4 x i32] undef, align 16 +// @lds.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +// +// define internal void @f0() { +// entry: +// %0 = load i16, i16 addrspace(3)* @lds.ptr, align 2 +// %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +// %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, +// i32 0, i32 0 +// ret void +// } +// +// define protected amdgpu_kernel void @k0() { +// entry: +// store i16 ptrtoint ([4 x i32] addrspace(3)* @lds to i16), +// i16 addrspace(3)* @lds.ptr, align 2 +// call void @f0() +// ret void +// } +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "Utils/AMDGPULDSUtils.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetOperations.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/ReplaceConstant.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include +#include + +#define DEBUG_TYPE "amdgpu-replace-lds-use-with-pointer" + +using namespace llvm; + +namespace { + +class ReplaceLDSUseImpl { + Module &M; + LLVMContext &Ctx; + const DataLayout &DL; + Constant *LDSMemBaseAddr; + + DenseMap LDSToPointer; + DenseMap> 
LDSToNonKernels; + DenseMap> KernelToCallees; + DenseMap> KernelToLDSPointers; + DenseMap> + FunctionToLDSToReplaceInst; + + // Collect LDS which requires their uses to be replaced by pointer. + std::vector collectLDSRequiringPointerReplace() { + auto UsedList = AMDGPU::getUsedList(M); + + // Collect LDS which requires module lowering. + std::vector LDSGlobals = + AMDGPU::findVariablesToLower(M, UsedList); + + // Remove LDS which don't qualify for replacement. + LDSGlobals.erase(std::remove_if(LDSGlobals.begin(), LDSGlobals.end(), + [&](GlobalVariable *GV) { + return shouldIgnorePointerReplacement(GV); + }), + LDSGlobals.end()); + + return LDSGlobals; + } + + // Returns true if uses of given LDS global within non-kernel functions should + // be kept as they are without pointer replacement. + bool shouldIgnorePointerReplacement(GlobalVariable *GV) { + // LDS whose size is very small and doesn't exceed pointer size is not worth + // replacing. + if (DL.getTypeAllocSize(GV->getValueType()) <= 2) + return true; + + // LDS which is not used from non-kernel function scope or it is used from + // global scope does not qualify for replacement. + LDSToNonKernels[GV] = AMDGPU::collectNonKernelAccessorsOfLDS(GV); + return LDSToNonKernels[GV].empty(); + + // FIXME: Any other scenarios which disqualify LDS from replacement? + } + + // Insert new global LDS pointer which points to LDS. + GlobalVariable *createLDSPointer(GlobalVariable *GV) { + // LDS pointer which points to LDS is already created? return it. + auto PointerEntry = LDSToPointer.insert(std::make_pair(GV, nullptr)); + if (!PointerEntry.second) + return PointerEntry.first->second; + + // We need to create new LDS pointer which points to LDS. + // + // Each CU owns at max 64K of LDS memory, so LDS address ranges from 0 to + // 2^16 - 1. Hence 16 bit pointer is enough to hold the LDS address. 
+ auto *I16Ty = Type::getInt16Ty(Ctx); + GlobalVariable *LDSPointer = new GlobalVariable( + M, I16Ty, false, GlobalValue::InternalLinkage, UndefValue::get(I16Ty), + GV->getName() + Twine(".ptr"), nullptr, GlobalVariable::NotThreadLocal, + AMDGPUAS::LOCAL_ADDRESS); + + LDSPointer->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + LDSPointer->setAlignment(AMDGPU::getAlign(DL, LDSPointer)); + + // Mark that an associated LDS pointer is created for LDS. + LDSToPointer[GV] = LDSPointer; + + return LDSPointer; + } + + // Within given kernel, initialize given LDS pointer to point to given LDS. + void initializeLDSPointer(Function *K, GlobalVariable *GV, + GlobalVariable *LDSPointer) { + // If LDS pointer is already initialized within K, then nothing to do. + auto PointerEntry = KernelToLDSPointers.insert( + std::make_pair(K, SmallPtrSet())); + if (!PointerEntry.second) + if (PointerEntry.first->second.contains(LDSPointer)) + return; + + // Insert instructions at EI which initialize LDS pointer to point-to LDS + // within kernel K. + // + // That is, convert pointer type of GV to i16, and then store this converted + // i16 value within LDSPointer which is of type i16*. + auto *EI = &(*(K->getEntryBlock().getFirstInsertionPt())); + IRBuilder<> Builder(EI); + Builder.CreateStore(Builder.CreatePtrToInt(GV, Type::getInt16Ty(Ctx)), + LDSPointer); + + // Mark that LDS pointer is initialized within kernel K. + KernelToLDSPointers[K].insert(LDSPointer); + } + + // We have created an LDS pointer for LDS, and initialized it to point-to LDS + // within all relevant kernels. Now replace all the uses of LDS within + // non-kernel functions by LDS pointer. + void replaceLDSUseByPointer(GlobalVariable *GV, GlobalVariable *LDSPointer) { + SmallVector LDSUsers(GV->users()); + for (auto *U : LDSUsers) { + // When `U` is a constant expression, it is possible that same constant + // expression exists within multiple instructions, and within multiple + // non-kernel functions. 
Collect all those non-kernel functions and all + // those instructions within which `U` exist. + auto FunctionToInsts = + AMDGPU::getFunctionToInstsMap(U, false /*=CollectKernelInsts*/); + + for (auto FI = FunctionToInsts.begin(), FE = FunctionToInsts.end(); + FI != FE; ++FI) { + Function *F = FI->first; + auto &Insts = FI->second; + for (auto *I : Insts) { + // If `U` is a constant expression, then we need to break the + // associated instruction into a set of separate instructions by + // converting constant expressions into instructions. + SmallPtrSet UserInsts; + + if (U == I) { + // `U` is an instruction, conversion from constant expression to + // set of instructions is *not* required. + UserInsts.insert(I); + } else { + // `U` is a constant expression, convert it into corresponding set + // of instructions. + auto *CE = cast(U); + convertConstantExprsToInstructions(I, CE, &UserInsts); + } + + // Go through all the user instructions, if LDS exist within them as an + // operand, then replace it by replace instruction. + for (auto *II : UserInsts) { + auto *ReplaceInst = getReplacementInst(F, GV, LDSPointer); + II->replaceUsesOfWith(GV, ReplaceInst); + } + } + } + } + } + + // Create a set of replacement instructions which together replace LDS within + // non-kernel function F by accessing LDS indirectly using LDS pointer. + Value *getReplacementInst(Function *F, GlobalVariable *GV, + GlobalVariable *LDSPointer) { + // If the instruction which replaces LDS within F is already created, then + // return it. + auto LDSEntry = FunctionToLDSToReplaceInst.insert( + std::make_pair(F, DenseMap())); + if (!LDSEntry.second) { + auto ReplaceInstEntry = + LDSEntry.first->second.insert(std::make_pair(GV, nullptr)); + if (!ReplaceInstEntry.second) + return ReplaceInstEntry.first->second; + } + + // Get the instruction insertion point within the beginning of the entry + // block of current non-kernel function. 
+ auto *EI = &(*(F->getEntryBlock().getFirstInsertionPt())); + IRBuilder<> Builder(EI); + + // Insert required set of instructions which replace LDS within F. + auto *V = Builder.CreateBitCast( + Builder.CreateGEP( + LDSMemBaseAddr, + Builder.CreateLoad(LDSPointer->getValueType(), LDSPointer)), + GV->getType()); + + // Mark that the replacement instruction which replace LDS within F is + // created. + FunctionToLDSToReplaceInst[F][GV] = V; + + return V; + } + +public: + ReplaceLDSUseImpl(Module &M) + : M(M), Ctx(M.getContext()), DL(M.getDataLayout()) { + LDSMemBaseAddr = Constant::getIntegerValue( + PointerType::get(Type::getInt8Ty(M.getContext()), + AMDGPUAS::LOCAL_ADDRESS), + APInt(32, 0)); + } + + // Entry-point function which interface ReplaceLDSUseImpl with outside of the + // class. + bool replaceLDSUse(); + +private: + // For a given LDS from collected LDS globals set, replace its non-kernel + // function scope uses by pointer. + bool replaceLDSUse(GlobalVariable *GV); +}; + +// For given LDS from collected LDS globals set, replace its non-kernel function +// scope uses by pointer. +bool ReplaceLDSUseImpl::replaceLDSUse(GlobalVariable *GV) { + // Holds all those non-kernel functions within which LDS is being accessed. + SmallPtrSet &LDSAccessors = LDSToNonKernels[GV]; + + // The LDS pointer which points to LDS and replaces all the uses of LDS. + GlobalVariable *LDSPointer = nullptr; + + // Traverse through each kernel K, check and if required, initialize the + // LDS pointer to point to LDS within K. + for (auto KI = KernelToCallees.begin(), KE = KernelToCallees.end(); KI != KE; + ++KI) { + Function *K = KI->first; + SmallPtrSet Callees = KI->second; + + // Compute reachable and LDS used callees for kernel K. + set_intersect(Callees, LDSAccessors); + + // None of the LDS accessing non-kernel functions are reachable from + // kernel K. Hence, no need to initialize LDS pointer within kernel K. 
+ if (Callees.empty()) + continue; + + // We have found reachable and LDS used callees for kernel K, and we need to + // initialize LDS pointer within kernel K, and we need to replace LDS use + // within those callees by LDS pointer. + // + // But, first check if LDS pointer is already created, if not create one. + LDSPointer = createLDSPointer(GV); + + // Initialize LDS pointer to point to LDS within kernel K. + initializeLDSPointer(K, GV, LDSPointer); + } + + // We have not found reachable and LDS used callees for any of the kernels, + // and hence we have not created LDS pointer. + if (!LDSPointer) + return false; + + // We have created an LDS pointer for LDS, and initialized it to point-to LDS + // within all relevant kernels. Now replace all the uses of LDS within + // non-kernel functions by LDS pointer. + replaceLDSUseByPointer(GV, LDSPointer); + + return true; +} + +// Entry-point function which interfaces ReplaceLDSUseImpl with outside of the +// class. +bool ReplaceLDSUseImpl::replaceLDSUse() { + // Collect LDS which requires their uses to be replaced by pointer. + std::vector LDSGlobals = + collectLDSRequiringPointerReplace(); + + // No LDS to pointer-replace. Nothing to do. + if (LDSGlobals.empty()) + return false; + + // Collect reachable callee set for each kernel defined in the module. + AMDGPU::collectReachableCallees(M, KernelToCallees); + + if (KernelToCallees.empty()) { + // Either module does not have any kernel definitions, or none of the kernel + // has a call to non-kernel functions, or we could not resolve any of the + // call sites to proper non-kernel functions, because of the situations like + // inline asm calls. Nothing to replace. + return false; + } + + // For every LDS from collected LDS globals set, replace its non-kernel + // function scope use by pointer. 
+ bool Changed = false; + for (auto *GV : LDSGlobals) + Changed |= replaceLDSUse(GV); + + return Changed; +} + +class AMDGPUReplaceLDSUseWithPointer : public ModulePass { +public: + static char ID; + + AMDGPUReplaceLDSUseWithPointer() : ModulePass(ID) { + initializeAMDGPUReplaceLDSUseWithPointerPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override; +}; + +} // namespace + +char AMDGPUReplaceLDSUseWithPointer::ID = 0; +char &llvm::AMDGPUReplaceLDSUseWithPointerID = + AMDGPUReplaceLDSUseWithPointer::ID; + +INITIALIZE_PASS(AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE, + "Replace within non-kernel function use of LDS with pointer", + false /*only look at the cfg*/, false /*analysis pass*/) + +bool AMDGPUReplaceLDSUseWithPointer::runOnModule(Module &M) { + ReplaceLDSUseImpl LDSUseReplacer{M}; + return LDSUseReplacer.replaceLDSUse(); +} + +ModulePass *llvm::createAMDGPUReplaceLDSUseWithPointerPass() { + return new AMDGPUReplaceLDSUseWithPointer(); +} + +PreservedAnalyses +AMDGPUReplaceLDSUseWithPointerPass::run(Module &M, ModuleAnalysisManager &AM) { + // The transformation rewrites IR; preserve analyses only if nothing changed. + bool Changed = ReplaceLDSUseImpl{M}.replaceLDSUse(); + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -193,6 +193,11 @@ cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), cl::Hidden); +static cl::opt EnableLDSReplaceWithPointer( + "amdgpu-enable-lds-replace-with-pointer", + cl::desc("Enable LDS replace with pointer pass"), cl::init(true), + cl::Hidden); + static cl::opt EnableLowerModuleLDS( "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"), cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true), @@ -240,6 +245,7 @@ initializeAMDGPULateCodeGenPreparePass(*PR); 
initializeAMDGPUPropagateAttributesEarlyPass(*PR); initializeAMDGPUPropagateAttributesLatePass(*PR); + initializeAMDGPUReplaceLDSUseWithPointerPass(*PR); initializeAMDGPULowerModuleLDSPass(*PR); initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); @@ -505,6 +511,10 @@ PM.addPass(AMDGPUAlwaysInlinePass()); return true; } + if (PassName == "amdgpu-replace-lds-use-with-pointer") { + PM.addPass(AMDGPUReplaceLDSUseWithPointerPass()); + return true; + } if (PassName == "amdgpu-lower-module-lds") { PM.addPass(AMDGPULowerModuleLDSPass()); return true; @@ -889,8 +899,15 @@ addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass()); // Can increase LDS used by kernel so runs before PromoteAlloca - if (EnableLowerModuleLDS) + if (EnableLowerModuleLDS) { + // The pass "amdgpu-replace-lds-use-with-pointer" need to be run before the + // pass "amdgpu-lower-module-lds", and also it required to be run only if + // "amdgpu-lower-module-lds" pass is enabled. + if (EnableLDSReplaceWithPointer) + addPass(createAMDGPUReplaceLDSUseWithPointerPass()); + addPass(createAMDGPULowerModuleLDSPass()); + } if (TM.getOptLevel() > CodeGenOpt::None) { addPass(createInferAddressSpacesPass()); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -81,6 +81,7 @@ AMDGPUPropagateAttributes.cpp AMDGPURegBankCombiner.cpp AMDGPURegisterBankInfo.cpp + AMDGPUReplaceLDSUseWithPointer.cpp AMDGPURewriteOutArguments.cpp AMDGPUSubtarget.cpp AMDGPUTargetMachine.cpp diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h @@ -14,6 +14,8 @@ #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H #include "AMDGPU.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/IR/Constants.h" namespace llvm { @@ -21,6 +23,24 
@@ namespace AMDGPU { +/// Collect reachable callees for each kernel defined in the module \p M and +/// return collected callees at \p KernelToCallees. +void collectReachableCallees( + Module &M, + DenseMap> &KernelToCallees); + +/// For the given LDS global \p GV, visit all its users and collect all +/// non-kernel functions within which \p GV is used and return collected list of +/// such non-kernel functions. +SmallPtrSet collectNonKernelAccessorsOfLDS(GlobalVariable *GV); + +/// Collect all the instructions where user \p U belongs to. \p U could be +/// instruction itself or it could be a constant expression which is used within +/// an instruction. If \p CollectKernelInsts is true, collect instructions only +/// from kernels, otherwise collect instructions only from non-kernel functions. +DenseMap> +getFunctionToInstsMap(User *U, bool CollectKernelInsts); + bool isKernelCC(const Function *Func); Align getAlign(DataLayout const &DL, const GlobalVariable *GV); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp @@ -12,7 +12,9 @@ #include "AMDGPULDSUtils.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/CallGraph.h" #include "llvm/IR/Constants.h" #include "llvm/IR/ReplaceConstant.h" @@ -22,6 +24,189 @@ namespace AMDGPU { +// An helper class for collecting all reachable callees for each kernel defined +// within the module. +class CollectReachableCallees { + Module &M; + CallGraph CG; + SmallPtrSet AddressTakenFunctions; + + // Collect all address taken functions within the module. 
+ void collectAddressTakenFunctions() { + auto *ECNode = CG.getExternalCallingNode(); + + for (auto GI = ECNode->begin(), GE = ECNode->end(); GI != GE; ++GI) { + auto *CGN = GI->second; + auto *F = CGN->getFunction(); + if (!F || F->isDeclaration() || AMDGPU::isKernelCC(F)) + continue; + AddressTakenFunctions.insert(CGN); + } + } + + // For a given caller node, collect all reachable callee nodes. + SmallPtrSet collectCGNodes(CallGraphNode *CGN) { + SmallPtrSet CGNodes; + + for (scc_iterator I = scc_begin(CGN); !I.isAtEnd(); ++I) { + const std::vector &SCC = *I; + assert(!SCC.empty() && "SCC with no functions?"); + for (auto *CGNode : SCC) + CGNodes.insert(CGNode); + } + + return CGNodes; + } + + // For given kernel, collect all its reachable non-kernel functions. + SmallPtrSet collectReachableCallees(Function *K) { + SmallPtrSet ReachableCallees; + + // Call graph node which represents this kernel. + auto *KCGN = CG[K]; + + // Collect all reachable call graph nodes from the node representing this + // kernel. + SmallPtrSet CGNodes = collectCGNodes(KCGN); + + // Go through collected reachable nodes, visit all their call sites, if the + // call site is direct, add corresponding callee to reachable callee set, if + // it is indirect, resolve the indirect call site to potential reachable + // callees, add them to reachable callee set, and repeat the process for the + // newly added potential callee nodes. + // + // FIXME: Need to handle bit-casted function pointers. 
+ // + SmallVector CGNStack(CGNodes.begin(), CGNodes.end()); + SmallPtrSet VisitedCGNodes; + while (!CGNStack.empty()) { + auto *CGN = CGNStack.pop_back_val(); + + if (!VisitedCGNodes.insert(CGN).second) + continue; + + for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) { + auto *RCB = cast(GI->first.getValue()); + auto *RCGN = GI->second; + + if (auto *DCallee = RCGN->getFunction()) { + ReachableCallees.insert(DCallee); + } else if (RCB->isIndirectCall()) { + auto *RCBFTy = RCB->getFunctionType(); + for (auto *ACGN : AddressTakenFunctions) { + auto *ACallee = ACGN->getFunction(); + if (ACallee->getFunctionType() == RCBFTy) { + ReachableCallees.insert(ACallee); + SmallPtrSet IGCNNodes = collectCGNodes(ACGN); + for (auto *IGCN : IGCNNodes) + CGNStack.push_back(IGCN); + } + } + } + } + } + + return ReachableCallees; + } + +public: + explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) { + // Collect address taken functions. + collectAddressTakenFunctions(); + } + + void collectReachableCallees( + DenseMap> &KernelToCallees) { + // Collect reachable callee set for each kernel defined in the module. + for (Function &F : M.functions()) { + if (!AMDGPU::isKernelCC(&F)) + continue; + Function *K = &F; + KernelToCallees[K] = collectReachableCallees(K); + } + } +}; + +void collectReachableCallees( + Module &M, + DenseMap> &KernelToCallees) { + CollectReachableCallees CRC{M}; + CRC.collectReachableCallees(KernelToCallees); +} + +SmallPtrSet collectNonKernelAccessorsOfLDS(GlobalVariable *GV) { + SmallPtrSet LDSAccessors; + SmallVector UserStack(GV->users()); + SmallPtrSet VisitedUsers; + + while (!UserStack.empty()) { + auto *U = UserStack.pop_back_val(); + + // `U` is already visited? continue to next one. + if (!VisitedUsers.insert(U).second) + continue; + + // `U` is a global variable which is initialized with LDS. Ignore LDS. + if (isa(U)) + return SmallPtrSet(); + + // Recursively explore constant users. 
+ if (isa(U)) { + append_range(UserStack, U->users()); + continue; + } + + // `U` should be an instruction, if it belongs to a non-kernel function F, + // then collect F. + Function *F = cast(U)->getFunction(); + if (!AMDGPU::isKernelCC(F)) + LDSAccessors.insert(F); + } + + return LDSAccessors; +} + +DenseMap> +getFunctionToInstsMap(User *U, bool CollectKernelInsts) { + DenseMap> FunctionToInsts; + SmallVector UserStack; + SmallPtrSet VisitedUsers; + + UserStack.push_back(U); + + while (!UserStack.empty()) { + auto *UU = UserStack.pop_back_val(); + + if (!VisitedUsers.insert(UU).second) + continue; + + if (isa(UU)) + continue; + + if (isa(UU)) { + append_range(UserStack, UU->users()); + continue; + } + + auto *I = cast(UU); + Function *F = I->getFunction(); + if (CollectKernelInsts) { + if (!AMDGPU::isKernelCC(F)) { + continue; + } + } else { + if (AMDGPU::isKernelCC(F)) { + continue; + } + } + + FunctionToInsts.insert(std::make_pair(F, SmallPtrSet())); + FunctionToInsts[F].insert(I); + } + + return FunctionToInsts; +} + bool isKernelCC(const Function *Func) { return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv()); } diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -42,6 +42,7 @@ ; GCN-O0-NEXT: Inliner for always_inline functions ; GCN-O0-NEXT: A No-Op Barrier Pass ; GCN-O0-NEXT: Lower OpenCL enqueued blocks +; GCN-O0-NEXT: Replace within non-kernel function use of LDS with pointer ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager ; GCN-O0-NEXT: Dominator Tree Construction @@ -192,6 +193,7 @@ ; GCN-O1-NEXT: Inliner for always_inline functions ; GCN-O1-NEXT: A No-Op Barrier Pass ; GCN-O1-NEXT: Lower OpenCL enqueued blocks +; GCN-O1-NEXT: Replace within non-kernel function use of LDS with pointer ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel 
functions ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Infer address spaces @@ -438,6 +440,7 @@ ; GCN-O1-OPTS-NEXT: Inliner for always_inline functions ; GCN-O1-OPTS-NEXT: A No-Op Barrier Pass ; GCN-O1-OPTS-NEXT: Lower OpenCL enqueued blocks +; GCN-O1-OPTS-NEXT: Replace within non-kernel function use of LDS with pointer ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: Infer address spaces @@ -717,6 +720,7 @@ ; GCN-O2-NEXT: Inliner for always_inline functions ; GCN-O2-NEXT: A No-Op Barrier Pass ; GCN-O2-NEXT: Lower OpenCL enqueued blocks +; GCN-O2-NEXT: Replace within non-kernel function use of LDS with pointer ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: Infer address spaces @@ -997,6 +1001,7 @@ ; GCN-O3-NEXT: Inliner for always_inline functions ; GCN-O3-NEXT: A No-Op Barrier Pass ; GCN-O3-NEXT: Lower OpenCL enqueued blocks +; GCN-O3-NEXT: Replace within non-kernel function use of LDS with pointer ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Infer address spaces diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-diamond-shape.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-diamond-shape.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-diamond-shape.ll @@ -0,0 +1,79 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; The lds global @lds_used_within_func is used within non-kernel function @func_uses_lds +; which is recheable from kernel @kernel_reaches_lds, hence pointer replacement takes place +; for @lds_used_within_func. +; + +; Original LDS should exist. 
+; CHECK: @lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4 + +; Pointer should be created. +; CHECK: @lds_used_within_func.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Pointer replacement code should be added. +define internal void @func_uses_lds() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_func.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_func, i32 0, i32 0 + ret void +} + +; No change +define internal void @func_does_not_use_lds_3() { +; CHECK-LABEL: entry: +; CHECK: call void @func_uses_lds() +; CHECK: ret void +entry: + call void @func_uses_lds() + ret void +} + +; No change +define internal void @func_does_not_use_lds_2() { +; CHECK-LABEL: entry: +; CHECK: call void @func_uses_lds() +; CHECK: ret void +entry: + call void @func_uses_lds() + ret void +} + +; No change +define internal void @func_does_not_use_lds_1() { +; CHECK-LABEL: entry: +; CHECK: call void @func_does_not_use_lds_2() +; CHECK: call void @func_does_not_use_lds_3() +; CHECK: ret void +entry: + call void @func_does_not_use_lds_2() + call void @func_does_not_use_lds_3() + ret void +} + +; Pointer initialization code should be added +define protected amdgpu_kernel void @kernel_reaches_lds() { +; CHECK-LABEL: entry: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_func to i16), i16 addrspace(3)* @lds_used_within_func.ptr, align 2 +; CHECK: call void @func_does_not_use_lds_1() +; CHECK: ret void +entry: + call void @func_does_not_use_lds_1() + ret void +} + +; No change here since this kernel does not reach 
@func_uses_lds which uses lds. +define protected amdgpu_kernel void @kernel_does_not_reach_lds() { +; CHECK-LABEL: entry: +; CHECK: ret void +entry: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-selected_functions.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-selected_functions.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-selected_functions.ll @@ -0,0 +1,103 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; There are three lds globals defined here, and these three lds are used respectively within +; three non-kernel functions. There are three kernels, which call two of the non-kernel functions. +; Hence pointer replacement should take place for all three lds, and pointer initialization within +; kernel should selectively happen depending on which lds is reachable from the kernel. +; + +; Original LDS should exist. +; CHECK: @lds_used_within_function_1 = internal addrspace(3) global [1 x i32] undef, align 4 +; CHECK: @lds_used_within_function_2 = internal addrspace(3) global [2 x i32] undef, align 4 +; CHECK: @lds_used_within_function_3 = internal addrspace(3) global [3 x i32] undef, align 4 +@lds_used_within_function_1 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_used_within_function_2 = internal addrspace(3) global [2 x i32] undef, align 4 +@lds_used_within_function_3 = internal addrspace(3) global [3 x i32] undef, align 4 + +; Pointers should be created. +; CHECK: @lds_used_within_function_1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK: @lds_used_within_function_2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK: @lds_used_within_function_3.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Pointer replacement code should be added. 
+define internal void @function_3() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [3 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [3 x i32], [3 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [3 x i32], [3 x i32] addrspace(3)* @lds_used_within_function_3, i32 0, i32 0 + ret void +} + +; Pointer replacement code should be added. +define internal void @function_2() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [2 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @lds_used_within_function_2, i32 0, i32 0 + ret void +} + +; Pointer replacement code should be added. 
+define internal void @function_1() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [1 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_used_within_function_1, i32 0, i32 0 + ret void +} + +; Pointer initialization code should be added +define protected amdgpu_kernel void @kernel_calls_function_3_and_1() { +; CHECK-LABEL: entry: +; CHECK: store i16 ptrtoint ([3 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2 +; CHECK: store i16 ptrtoint ([1 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2 +; CHECK: call void @function_3() +; CHECK: call void @function_1() +; CHECK: ret void +entry: + call void @function_3() + call void @function_1() + ret void +} + +; Pointer initialization code should be added +define protected amdgpu_kernel void @kernel_calls_function_2_and_3() { +; CHECK-LABEL: entry: +; CHECK: store i16 ptrtoint ([3 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2 +; CHECK: store i16 ptrtoint ([2 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2 +; CHECK: call void @function_2() +; CHECK: call void @function_3() +; CHECK: ret void +entry: + call void @function_2() + call void @function_3() + ret void +} + +; Pointer initialization code should be added +define protected amdgpu_kernel void @kernel_calls_function_1_and_2() { +; CHECK-LABEL: entry: +; CHECK: store i16 ptrtoint ([2 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)*
@lds_used_within_function_2.ptr, align 2 +; CHECK: store i16 ptrtoint ([1 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2 +; CHECK: call void @function_1() +; CHECK: call void @function_2() +; CHECK: ret void +entry: + call void @function_1() + call void @function_2() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll @@ -0,0 +1,33 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; LDS global @lds is used within non-kernel function @f0, and @f0 is reachable +; from kernel @k0, but since @lds is also used in global scope as an initializer +; to @global_var, pointer replacement does not take place. +; + +; CHECK: @lds = internal addrspace(3) global [4 x i32] undef, align 4 +; CHECK: @global_var = addrspace(1) global float* addrspacecast (float addrspace(3)* bitcast ([4 x i32] addrspace(3)* @lds to float addrspace(3)*) to float*), align 8 +@lds = internal addrspace(3) global [4 x i32] undef, align 4 +@global_var = addrspace(1) global float* addrspacecast ([4 x i32] addrspace(3)* @lds to float*), align 8 + +; CHECK-NOT: @lds.ptr + +define void @f0() { +; CHECK-LABEL: entry: +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @k0() { +; CHECK-LABEL: entry: +; CHECK: call void @f0() +; CHECK: ret void +entry: + call void @f0() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll new file
mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll @@ -0,0 +1,30 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; We do not know what to do with inline asm call, we ignore it, hence pointer replacement for +; @used_only_within_func does not take place. +; + +; CHECK: @used_only_within_func = addrspace(3) global [4 x i32] undef, align 4 +@used_only_within_func = addrspace(3) global [4 x i32] undef, align 4 + +; CHECK-NOT: @used_only_within_func.ptr + +define void @f0(i32 %x) { +; CHECK-LABEL: entry: +; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_func, i32 0, i32 0) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_func, i32 0, i32 0) to i32*) to i64)) to i32*), align 4 +; CHECK: ret void +entry: + store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 + ret void +} + +define amdgpu_kernel void @k0() { +; CHECK-LABEL: entry: +; CHECK: call i32 asm "s_mov_b32 $0, 0", "=s"() +; CHECK: ret void +entry: + call i32 asm "s_mov_b32 $0, 0", "=s"() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll @@ -0,0 +1,25 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + 
+; DESCRIPTION ; +; +; LDS global @used_only_within_kern is used only within kernel @k0, hence pointer replacement +; does not take place for @used_only_within_kern. +; + +; CHECK: @used_only_within_kern = addrspace(3) global [4 x i32] undef, align 4 +@used_only_within_kern = addrspace(3) global [4 x i32] undef, align 4 + +; CHECK-NOT: @used_only_within_kern.ptr + +define amdgpu_kernel void @k0() { +; CHECK-LABEL: entry: +; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_kern, i32 0, i32 0) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_kern, i32 0, i32 0) to i32*) to i64)) to i32*), align 4 +; CHECK: %mul = mul i32 %ld, 2 +; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_kern, i32 0, i32 0) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @used_only_within_kern, i32 0, i32 0) to i32*) to i64)) to i32*), align 4 +; CHECK: ret void +entry: + %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 + %mul = mul i32 %ld, 2 + store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to 
i64)) to i32*), align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll @@ -0,0 +1,28 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION ; +; +; LDS global @not-reachable-lds is used within non-kernel function @f0, but @f0 is *not* +; reachable from kernel @k0, hence pointer replacement does not take place. +; + +; CHECK: @not-reachable-lds = internal addrspace(3) global [4 x i32] undef, align 4 +@not-reachable-lds = internal addrspace(3) global [4 x i32] undef, align 4 + +; CHECK-NOT: @not-reachable-lds.ptr + +define internal void @f0() { +; CHECK-LABEL: entry: +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @not-reachable-lds, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @not-reachable-lds, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @k0() { +; CHECK-LABEL: entry: +; CHECK: ret void +entry: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll @@ -0,0 +1,31 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION ; +; +; LDS global @small_lds is used within non-kernel function @f0, and @f0 is reachable +; from kernel @k0, but since @small_lds is too small for pointer replacement, pointer +; replacement does not take place.
+; + +; CHECK: @small_lds = addrspace(3) global i8 undef, align 1 +@small_lds = addrspace(3) global i8 undef, align 1 + +; CHECK-NOT: @small_lds.ptr + +define void @f0() { +; CHECK-LABEL: entry: +; CHECK: store i8 1, i8 addrspace(3)* @small_lds, align 1 +; CHECK: ret void +entry: + store i8 1, i8 addrspace(3)* @small_lds, align 1 + ret void +} + +define amdgpu_kernel void @k0() { +; CHECK-LABEL: entry: +; CHECK: call void @f0() +; CHECK: ret void +entry: + call void @f0() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-diamond-shape.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-diamond-shape.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-diamond-shape.ll @@ -0,0 +1,86 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; The lds global @lds_used_within_func is used within non-kernel function @func_uses_lds +; which is *indirectly* reachable from kernel @kernel_reaches_lds, hence pointer replacement +; takes place for @lds_used_within_func. + +; Original LDS should exist. +; CHECK: @lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4 + +; Function pointer should exist as it is. +; CHECK: @ptr_to_func = internal local_unnamed_addr externally_initialized global void ()* @func_uses_lds, align 8 +@ptr_to_func = internal local_unnamed_addr externally_initialized global void ()* @func_uses_lds, align 8 + +; Pointer should be created. +; CHECK: @lds_used_within_func.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Pointer replacement code should be added.
+define internal void @func_uses_lds() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_func.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_func, i32 0, i32 0 + ret void +} + +; No change +define internal void @func_does_not_use_lds_3() { +; CHECK-LABEL: entry: +; CHECK: %fptr = load void ()*, void ()** @ptr_to_func, align 8 +; CHECK: call void %fptr() +; CHECK: ret void +entry: + %fptr = load void ()*, void ()** @ptr_to_func, align 8 + call void %fptr() + ret void +} + +; No change +define internal void @func_does_not_use_lds_2() { +; CHECK-LABEL: entry: +; CHECK: %fptr = load void ()*, void ()** @ptr_to_func, align 8 +; CHECK: call void %fptr() +; CHECK: ret void +entry: + %fptr = load void ()*, void ()** @ptr_to_func, align 8 + call void %fptr() + ret void +} + +; No change +define internal void @func_does_not_use_lds_1() { +; CHECK-LABEL: entry: +; CHECK: call void @func_does_not_use_lds_2() +; CHECK: call void @func_does_not_use_lds_3() +; CHECK: ret void +entry: + call void @func_does_not_use_lds_2() + call void @func_does_not_use_lds_3() + ret void +} + +; Pointer initialization code should be added +define protected amdgpu_kernel void @kernel_reaches_lds() { +; CHECK-LABEL: entry: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_func to i16), i16 addrspace(3)* @lds_used_within_func.ptr, align 2 +; CHECK: call void @func_does_not_use_lds_1() +; CHECK: ret void +entry: + call void @func_does_not_use_lds_1() + ret void +} + +; No change here since this kernel does not reach @func_uses_lds which uses lds.
+define protected amdgpu_kernel void @kernel_does_not_reach_lds() { +; CHECK-LABEL: entry: +; CHECK: ret void +entry: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-selected_functions.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-selected_functions.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-selected_functions.ll @@ -0,0 +1,124 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; There are three lds globals defined here, and these three lds are used respectively within +; three non-kernel functions. There are three kernels, which *indirectly* call two of the +; non-kernel functions. Hence pointer replacement should take place for all three lds, and +; pointer initialization within kernel should selectively happen depending on which lds is +; reachable from the kernel. +; + +; Original LDS should exist. +; CHECK: @lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4 +; CHECK: @lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4 +; CHECK: @lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4 + +; Function pointers should exist. 
+; CHECK: @ptr_to_func1 = internal local_unnamed_addr externally_initialized global void (float)* @function_1, align 8 +; CHECK: @ptr_to_func2 = internal local_unnamed_addr externally_initialized global void (i16)* @function_2, align 8 +; CHECK: @ptr_to_func3 = internal local_unnamed_addr externally_initialized global void (i8)* @function_3, align 8 +@ptr_to_func1 = internal local_unnamed_addr externally_initialized global void (float)* @function_1, align 8 +@ptr_to_func2 = internal local_unnamed_addr externally_initialized global void (i16)* @function_2, align 8 +@ptr_to_func3 = internal local_unnamed_addr externally_initialized global void (i8)* @function_3, align 8 + +; Pointers should be created. +; CHECK: @lds_used_within_function_1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK: @lds_used_within_function_2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK: @lds_used_within_function_3.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Pointer replacement code should be added. +define internal void @function_3(i8 %c) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_3, i32 0, i32 0 + ret void +} + +; Pointer replacement code should be added. 
+define internal void @function_2(i16 %i) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_2, i32 0, i32 0 + ret void +} + +; Pointer replacement code should be added. +define internal void @function_1(float %f) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_1, i32 0, i32 0 + ret void +} + +; Pointer initialization code should be added +define protected amdgpu_kernel void @kernel_calls_function_3_and_1() { +; CHECK-LABEL: entry: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2 +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2 +; CHECK: %fptr3 = load void (i8)*, void (i8)** @ptr_to_func3, align 8 +; CHECK: %fptr1 = load void (float)*, void (float)** @ptr_to_func1, align 8 +; CHECK: call void %fptr3(i8 1) +; CHECK: call void %fptr1(float 2.000000e+00) +; CHECK: ret void +entry: + %fptr3 = load void (i8)*, void (i8)** @ptr_to_func3, align 8 + %fptr1 = load void (float)*, void (float)** @ptr_to_func1, align 8 + call void %fptr3(i8 1) + call void %fptr1(float 2.0) + ret void +}
+ +; Pointer initialization code should be added +define protected amdgpu_kernel void @kernel_calls_function_2_and_3() { +; CHECK-LABEL: entry: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2 +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2 +; CHECK: %fptr2 = load void (i16)*, void (i16)** @ptr_to_func2, align 8 +; CHECK: %fptr3 = load void (i8)*, void (i8)** @ptr_to_func3, align 8 +; CHECK: call void %fptr2(i16 3) +; CHECK: call void %fptr3(i8 4) +; CHECK: ret void +entry: + %fptr2 = load void (i16)*, void (i16)** @ptr_to_func2, align 8 + %fptr3 = load void (i8)*, void (i8)** @ptr_to_func3, align 8 + call void %fptr2(i16 3) + call void %fptr3(i8 4) + ret void +} + +; Pointer initialization code should be added +define protected amdgpu_kernel void @kernel_calls_function_1_and_2() { +; CHECK-LABEL: entry: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2 +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2 +; CHECK: %fptr1 = load void (float)*, void (float)** @ptr_to_func1, align 8 +; CHECK: %fptr2 = load void (i16)*, void (i16)** @ptr_to_func2, align 8 +; CHECK: call void %fptr1(float 5.000000e+00) +; CHECK: call void %fptr2(i16 6) +; CHECK: ret void +entry: + %fptr1 = load void (float)*, void (float)** @ptr_to_func1, align 8 + %fptr2 = load void (i16)*, void (i16)** @ptr_to_func2, align 8 + call void %fptr1(float 5.0) + call void %fptr2(i16 6) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-signature-match.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-signature-match.ll new file mode 100644 --- /dev/null +++
b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-signature-match.ll @@ -0,0 +1,85 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; There are three lds globals defined here, and these three lds are used respectively within +; three non-kernel functions. There is one kernel which *indirectly* calls one of the non-kernel +; functions. But since all three non-kernel functions have the same signature, all three +; non-kernel functions are resolved as potential callees for the indirect call-site. Hence we end up +; with pointer replacement for all three lds globals. +; + +; Original LDS should exist. +; CHECK: @lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4 +; CHECK: @lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4 +; CHECK: @lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4 + +; Function pointers should exist. +; CHECK: @ptr_to_func1 = internal local_unnamed_addr externally_initialized global void (i16)* @function_1, align 8 +; CHECK: @ptr_to_func2 = internal local_unnamed_addr externally_initialized global void (i16)* @function_2, align 8 +; CHECK: @ptr_to_func3 = internal local_unnamed_addr externally_initialized global void (i16)* @function_3, align 8 +@ptr_to_func1 = internal local_unnamed_addr externally_initialized global void (i16)* @function_1, align 8 +@ptr_to_func2 = internal local_unnamed_addr externally_initialized global void (i16)* @function_2, align 8 +@ptr_to_func3 = internal local_unnamed_addr externally_initialized global void (i16)* @function_3, align 8 + +; Pointers should be created.
+; CHECK: @lds_used_within_function_1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK: @lds_used_within_function_2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK: @lds_used_within_function_3.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Pointer replacement code should be added. +define internal void @function_3(i16 %i) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_3, i32 0, i32 0 + ret void +} + +; Pointer replacement code should be added. +define internal void @function_2(i16 %i) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_2, i32 0, i32 0 + ret void +} + +; Pointer replacement code should be added. 
+define internal void @function_1(i16 %i) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_1, i32 0, i32 0 + ret void +} + +; Pointer initialization code should be added +define protected amdgpu_kernel void @kernel_indirectly_calls_function_1() { +; CHECK-LABEL: entry: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2 +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2 +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2 +; CHECK: %fptr1 = load void (i16)*, void (i16)** @ptr_to_func1, align 8 +; CHECK: call void %fptr1(i16 6) +; CHECK: ret void +entry: + %fptr1 = load void (i16)*, void (i16)** @ptr_to_func1, align 8 + call void %fptr1(i16 6) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-multiple-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-multiple-lds.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-multiple-lds.ll @@ -0,0 +1,57 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; There are three lds globals defined here, and these three lds are used within a single +; non-kernel function, and this non-kernel function is reachable from kernel. Hence pointer +; replacement is required for all three lds globals.
+; + +; Original LDS should exist. +; CHECK: @lds1 = internal addrspace(3) global [1 x i32] undef, align 4 +; CHECK: @lds2 = internal addrspace(3) global [2 x i32] undef, align 4 +; CHECK: @lds3 = internal addrspace(3) global [3 x i32] undef, align 4 +@lds1 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds2 = internal addrspace(3) global [2 x i32] undef, align 4 +@lds3 = internal addrspace(3) global [3 x i32] undef, align 4 + +; Pointers should be created. +; CHECK: @lds1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK: @lds2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK: @lds3.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Pointer replacement code should be added. +define internal void @function() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds3.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [3 x i32] addrspace(3)* +; CHECK: %3 = load i16, i16 addrspace(3)* @lds2.ptr, align 2 +; CHECK: %4 = getelementptr i8, i8 addrspace(3)* null, i16 %3 +; CHECK: %5 = bitcast i8 addrspace(3)* %4 to [2 x i32] addrspace(3)* +; CHECK: %6 = load i16, i16 addrspace(3)* @lds1.ptr, align 2 +; CHECK: %7 = getelementptr i8, i8 addrspace(3)* null, i16 %6 +; CHECK: %8 = bitcast i8 addrspace(3)* %7 to [1 x i32] addrspace(3)* +; CHECK: %gep1 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %8, i32 0, i32 0 +; CHECK: %gep2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* %5, i32 0, i32 0 +; CHECK: %gep3 = getelementptr inbounds [3 x i32], [3 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep1 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds1, i32 0, i32 0 + %gep2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @lds2, i32 0, i32 0 + %gep3 = getelementptr inbounds [3 x i32], [3 x i32] addrspace(3)* @lds3, i32 0, i32 0 + ret void +} + 
+; Pointer initialization code should be added +define protected amdgpu_kernel void @kernel() { +; CHECK-LABEL: entry: +; CHECK: store i16 ptrtoint ([3 x i32] addrspace(3)* @lds3 to i16), i16 addrspace(3)* @lds3.ptr, align 2 +; CHECK: store i16 ptrtoint ([2 x i32] addrspace(3)* @lds2 to i16), i16 addrspace(3)* @lds2.ptr, align 2 +; CHECK: store i16 ptrtoint ([1 x i32] addrspace(3)* @lds1 to i16), i16 addrspace(3)* @lds1.ptr, align 2 +; CHECK: call void @function() +; CHECK: ret void +entry: + call void @function() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-same-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-same-lds.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-same-lds.ll @@ -0,0 +1,44 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; There is one lds global defined here, and this lds is used within a single non-kernel +; function multiple times, and this non-kernel function is reachable from kernel. Hence +; pointer replacement takes place. But an important note is - store-to/load-from pointer should happen +; only once irrespective of number of uses. +; + +; Original LDS should exist. +; CHECK: @lds1 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds1 = internal addrspace(3) global [1 x i32] undef, align 4 + +; Pointers should be created. +; CHECK: @lds1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Pointer replacement code should be added.
+define internal void @function() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds1.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [1 x i32] addrspace(3)* +; CHECK: %gep1 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: %gep2 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: %gep3 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep1 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds1, i32 0, i32 0 + %gep2 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds1, i32 0, i32 0 + %gep3 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds1, i32 0, i32 0 + ret void +} + +; Pointer initialization code should be added. +define protected amdgpu_kernel void @kernel() { +; CHECK-LABEL: entry: +; CHECK: store i16 ptrtoint ([1 x i32] addrspace(3)* @lds1 to i16), i16 addrspace(3)* @lds1.ptr, align 2 +; CHECK: call void @function() +; CHECK: ret void +entry: + call void @function() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll @@ -0,0 +1,45 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; There is one lds global defined here, and this lds is used within a single non-kernel +; function, as an operand of nested constant expression, and this non-kernel function is +; reachable from kernel. Hence the nested constant expression should be converted into a +; series of instructions and pointer replacement should take place. +; + +; Original LDS should exist. 
+; CHECK: @used_only_within_func = addrspace(3) global [4 x i32] undef, align 4 +@used_only_within_func = addrspace(3) global [4 x i32] undef, align 4 + +; Pointers should be created. +; CHECK: @used_only_within_func.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Pointer replacement code should be added. +define void @f0(i32 %x) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @used_only_within_func.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: %4 = addrspacecast i32 addrspace(3)* %3 to i32* +; CHECK: %5 = ptrtoint i32* %4 to i64 +; CHECK: %6 = add i64 %5, %5 +; CHECK: %7 = inttoptr i64 %6 to i32* +; CHECK: store i32 %x, i32* %7, align 4 +; CHECK: ret void +entry: + store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 + ret void +} + +; Pointer initialization code should be added. +define amdgpu_kernel void @k0() { +; CHECK-LABEL: entry: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @used_only_within_func to i16), i16 addrspace(3)* @used_only_within_func.ptr, align 2 +; CHECK: call void @f0(i32 0) +; CHECK: ret void +entry: + call void @f0(i32 0) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr2.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr2.ll @@ -0,0 +1,49 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s 
+ +; DESCRIPTION: +; There is one lds global defined here, and this lds is used within a single non-kernel +; function, as an operand of nested constant expression, and this non-kernel function is +; reachable from kernel. Hence the nested constant expression should be converted into a +; series of instructions and pointer replacement should take place. But the important point +; is that only constant expression operands which use lds should be converted into +; instructions; other constant expression operands which do not use lds should be left +; untouched. +; + +; Original LDS should exist. +; CHECK: @lds_used_within_function = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function = internal addrspace(3) global [4 x i32] undef, align 4 + +; Non-LDS global should exist as it is. +; CHECK: @global_var = internal addrspace(1) global [4 x i32] undef, align 4 +@global_var = internal addrspace(1) global [4 x i32] undef, align 4 + +; Pointer should be created. +; CHECK: @lds_used_within_function.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Pointer replacement code should be added. 
+define internal void @function() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds_used_within_function.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 2 +; CHECK: %4 = addrspacecast i32 addrspace(3)* %3 to i32* +; CHECK: %5 = ptrtoint i32* %4 to i32 +; CHECK: %6 = add i32 %5, ptrtoint (i32 addrspace(1)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(1)* @global_var, i32 0, i32 2) to i32) +; CHECK: ret void +entry: + %0 = add i32 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function, i32 0, i32 2) to i32*) to i32), ptrtoint (i32 addrspace(1)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(1)* @global_var, i32 0, i32 2) to i32) + ret void +} + +; Pointer initialization code should be added. +define protected amdgpu_kernel void @kernel() { +; CHECK-LABEL: entry: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function to i16), i16 addrspace(3)* @lds_used_within_function.ptr, align 2 +; CHECK: call void @function() +; CHECK: ret void +entry: + call void @function() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-phi-inst.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-phi-inst.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-phi-inst.ll @@ -0,0 +1,84 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s + +; DESCRIPTION: +; +; LDS globals used as incoming values of phi instructions should be replaced by pointers. +; + +; Original LDS should exist. 
+; CHECK: @lds.1 = addrspace(3) global i32 undef, align 4 +; CHECK: @lds.2 = addrspace(3) global i32 undef, align 4 +@lds.1 = addrspace(3) global i32 undef, align 4 +@lds.2 = addrspace(3) global i32 undef, align 4 + +; One i16 pointer global should be created for each LDS global. +; CHECK: @lds.1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK: @lds.2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +define void @f0(i32 %arg) { +; CHECK-LABEL: bb: +; CHECK: %0 = load i16, i16 addrspace(3)* @lds.2.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to i32 addrspace(3)* +; CHECK: %3 = load i16, i16 addrspace(3)* @lds.1.ptr, align 2 +; CHECK: %4 = getelementptr i8, i8 addrspace(3)* null, i16 %3 +; CHECK: %5 = bitcast i8 addrspace(3)* %4 to i32 addrspace(3)* +; CHECK: %id = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK: %my.tmp = sub i32 %id, %arg +; CHECK: br label %bb1 +bb: + %id = call i32 @llvm.amdgcn.workitem.id.x() + %my.tmp = sub i32 %id, %arg + br label %bb1 + +; CHECK-LABEL: bb1: +; CHECK: %lsr.iv = phi i32 [ undef, %bb ], [ %my.tmp2, %Flow ] +; CHECK: %6 = icmp ne i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), %5 +; CHECK: %lsr.iv.next = add i32 %lsr.iv, 1 +; CHECK: %cmp0 = icmp slt i32 %lsr.iv.next, 0 +; CHECK: br i1 %cmp0, label %bb4, label %Flow +bb1: + %lsr.iv = phi i32 [ undef, %bb ], [ %my.tmp2, %Flow ] + %lsr.iv.next = add i32 %lsr.iv, 1 + %cmp0 = icmp slt i32 %lsr.iv.next, 0 + br i1 %cmp0, label %bb4, label %Flow + +; CHECK-LABEL: bb4: +; CHECK: %load = load volatile i32, i32 addrspace(1)* undef, align 4 +; CHECK: %cmp1 = icmp sge i32 %my.tmp, %load +; CHECK: br label %Flow +bb4: + %load = load volatile i32, i32 addrspace(1)* undef, align 4 + %cmp1 = icmp sge i32 %my.tmp, %load + br label %Flow + +; CHECK-LABEL: Flow: +; CHECK: %my.tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] +; CHECK: %my.tmp3 = phi i32 addrspace(3)* [ %2, %bb4 ], [ %5, %bb1 ] +; 
CHECK: %my.tmp4 = phi i1 [ %cmp1, %bb4 ], [ %6, %bb1 ] +; CHECK: br i1 %my.tmp4, label %bb9, label %bb1 +Flow: + %my.tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] + %my.tmp3 = phi i32 addrspace(3)* [@lds.2, %bb4 ], [ @lds.1, %bb1 ] + %my.tmp4 = phi i1 [ %cmp1, %bb4 ], [ icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds.1), %bb1 ] + br i1 %my.tmp4, label %bb9, label %bb1 + +; CHECK-LABEL: bb9: +; CHECK: store volatile i32 7, i32 addrspace(3)* undef, align 4 +; CHECK: ret void +bb9: + store volatile i32 7, i32 addrspace(3)* undef + ret void +} + +; CHECK-LABEL: @k0 +; CHECK: store i16 ptrtoint (i32 addrspace(3)* @lds.2 to i16), i16 addrspace(3)* @lds.2.ptr, align 2 +; CHECK: store i16 ptrtoint (i32 addrspace(3)* @lds.1 to i16), i16 addrspace(3)* @lds.1.ptr, align 2 +; CHECK: call void @f0(i32 %arg) +; CHECK: ret void +define amdgpu_kernel void @k0(i32 %arg) { + call void @f0(i32 %arg) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x()