diff --git a/llvm/include/llvm/IR/ReplaceConstant.h b/llvm/include/llvm/IR/ReplaceConstant.h --- a/llvm/include/llvm/IR/ReplaceConstant.h +++ b/llvm/include/llvm/IR/ReplaceConstant.h @@ -21,40 +21,6 @@ template class ArrayRef; class Constant; -class ConstantExpr; -class Instruction; -class Use; -template class SmallPtrSetImpl; - -/// The given instruction \p I contains given constant expression \p CE as one -/// of its operands, possibly nested within constant expression trees. Convert -/// all reachable paths from contant expression operands of \p I to \p CE into -/// corresponding instructions, insert them before \p I, update operands of \p I -/// accordingly, and if required, return all such converted instructions at -/// \p Insts. -void convertConstantExprsToInstructions( - Instruction *I, ConstantExpr *CE, - SmallPtrSetImpl *Insts = nullptr); - -/// The given instruction \p I contains constant expression CE within the -/// constant expression trees of it`s constant expression operands, and -/// \p CEPaths holds all the reachable paths (to CE) from such constant -/// expression trees of \p I. Convert constant expressions within these paths -/// into corresponding instructions, insert them before \p I, update operands of -/// \p I accordingly, and if required, return all such converted instructions at -/// \p Insts. -void convertConstantExprsToInstructions( - Instruction *I, - std::map>> &CEPaths, - SmallPtrSetImpl *Insts = nullptr); - -/// Given an instruction \p I which uses given constant expression \p CE as -/// operand, either directly or nested within other constant expressions, return -/// all reachable paths from the constant expression operands of \p I to \p CE, -/// and return collected paths at \p CEPaths. -void collectConstantExprPaths( - Instruction *I, ConstantExpr *CE, - std::map>> &CEPaths); /// Replace constant expressions users of the given constants with /// instructions. Return whether anything was changed. diff --git a/llvm/lib/IR/ReplaceConstant.cpp b/llvm/lib/IR/ReplaceConstant.cpp --- a/llvm/lib/IR/ReplaceConstant.cpp +++ b/llvm/lib/IR/ReplaceConstant.cpp @@ -13,127 +13,11 @@ #include "llvm/IR/ReplaceConstant.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/ValueMap.h" namespace llvm { -void convertConstantExprsToInstructions(Instruction *I, ConstantExpr *CE, - SmallPtrSetImpl *Insts) { - // Collect all reachable paths to CE from constant exprssion operands of I. - std::map>> CEPaths; - collectConstantExprPaths(I, CE, CEPaths); - - // Convert all constant expressions to instructions which are collected at - // CEPaths. - convertConstantExprsToInstructions(I, CEPaths, Insts); -} - -void convertConstantExprsToInstructions( - Instruction *I, - std::map>> &CEPaths, - SmallPtrSetImpl *Insts) { - ValueMap Visited; - - for (Use &U : I->operands()) { - // The operand U is either not a constant expression operand or the - // constant expression paths do not belong to U, ignore U. - if (!CEPaths.count(&U)) - continue; - - // If the instruction I is a PHI instruction, then fix the instruction - // insertion point to the entry of the incoming basic block for operand U. - auto *BI = I; - if (auto *Phi = dyn_cast(I)) { - BasicBlock *BB = Phi->getIncomingBlock(U); - BI = &(*(BB->getFirstInsertionPt())); - } - - // Go through all the paths associated with operand U, and convert all the - // constant expressions along all the paths to corresponding instructions. - auto *II = I; - auto &Paths = CEPaths[&U]; - for (auto &Path : Paths) { - for (auto *CE : Path) { - // Instruction which is equivalent to CE. - Instruction *NI = nullptr; - - if (!Visited.count(CE)) { - // CE is encountered first time, convert it into a corresponding - // instruction NI, and appropriately insert NI before the parent - // instruction. - NI = CE->getAsInstruction(BI); - - // Mark CE as visited by mapping CE to NI. - Visited[CE] = NI; - - // If required collect NI. - if (Insts) - Insts->insert(NI); - } else { - // We had already encountered CE, the correponding instruction already - // exist, use it to replace CE. - NI = Visited[CE]; - } - - assert(NI && "Expected an instruction corresponding to constant " - "expression."); - - // Replace all uses of constant expression CE by the corresponding - // instruction NI within the current parent instruction. - II->replaceUsesOfWith(CE, NI); - BI = II = NI; - } - } - } - - // Remove all converted constant expressions which are dead by now. - for (auto Item : Visited) - Item.first->removeDeadConstantUsers(); -} - -void collectConstantExprPaths( - Instruction *I, ConstantExpr *CE, - std::map>> &CEPaths) { - for (Use &U : I->operands()) { - // If the operand U is not a constant expression operand, then ignore it. - auto *CE2 = dyn_cast(U.get()); - if (!CE2) - continue; - - // Holds all reachable paths from CE2 to CE. - std::vector> Paths; - - // Collect all reachable paths from CE2 to CE. - std::vector Path{CE2}; - std::vector> Stack{Path}; - while (!Stack.empty()) { - std::vector TPath = Stack.back(); - Stack.pop_back(); - auto *CE3 = TPath.back(); - - if (CE3 == CE) { - Paths.push_back(TPath); - continue; - } - - for (auto &UU : CE3->operands()) { - if (auto *CE4 = dyn_cast(UU.get())) { - std::vector NPath(TPath.begin(), TPath.end()); - NPath.push_back(CE4); - Stack.push_back(NPath); - } - } - } - - // Associate all the collected paths with U, and save it. - if (!Paths.empty()) - CEPaths[&U] = Paths; - } -} - static bool isExpandableUser(User *U) { return isa(U) || isa(U); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -56,7 +56,6 @@ FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *); ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *); FunctionPass *createAMDGPURewriteOutArgumentsPass(); -ModulePass *createAMDGPUReplaceLDSUseWithPointerPass(); ModulePass *createAMDGPULowerModuleLDSPass(); FunctionPass *createSIModeRegisterPass(); FunctionPass *createGCNPreRAOptimizationsPass(); @@ -144,14 +143,6 @@ TargetMachine &TM; }; -void initializeAMDGPUReplaceLDSUseWithPointerPass(PassRegistry &); -extern char &AMDGPUReplaceLDSUseWithPointerID; - -struct AMDGPUReplaceLDSUseWithPointerPass - : PassInfoMixin { - PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); -}; - void initializeAMDGPULowerModuleLDSPass(PassRegistry &); extern char &AMDGPULowerModuleLDSID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp deleted file mode 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp +++ /dev/null @@ -1,648 +0,0 @@ -//===-- AMDGPUReplaceLDSUseWithPointer.cpp --------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This pass replaces all the uses of LDS within non-kernel functions by -// corresponding pointer counter-parts. -// -// The main motivation behind this pass is - to *avoid* subsequent LDS lowering -// pass from directly packing LDS (assume large LDS) into a struct type which -// would otherwise cause allocating huge memory for struct instance within every -// kernel. -// -// Brief sketch of the algorithm implemented in this pass is as below: -// -// 1. Collect all the LDS defined in the module which qualify for pointer -// replacement, say it is, LDSGlobals set. -// -// 2. Collect all the reachable callees for each kernel defined in the module, -// say it is, KernelToCallees map. -// -// 3. FOR (each global GV from LDSGlobals set) DO -// LDSUsedNonKernels = Collect all non-kernel functions which use GV. -// FOR (each kernel K in KernelToCallees map) DO -// ReachableCallees = KernelToCallees[K] -// ReachableAndLDSUsedCallees = -// SetIntersect(LDSUsedNonKernels, ReachableCallees) -// IF (ReachableAndLDSUsedCallees is not empty) THEN -// Pointer = Create a pointer to point-to GV if not created. -// Initialize Pointer to point-to GV within kernel K. -// ENDIF -// ENDFOR -// Replace all uses of GV within non kernel functions by Pointer. -// ENFOR -// -// LLVM IR example: -// -// Input IR: -// -// @lds = internal addrspace(3) global [4 x i32] undef, align 16 -// -// define internal void @f0() { -// entry: -// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds, -// i32 0, i32 0 -// ret void -// } -// -// define protected amdgpu_kernel void @k0() { -// entry: -// call void @f0() -// ret void -// } -// -// Output IR: -// -// @lds = internal addrspace(3) global [4 x i32] undef, align 16 -// @lds.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 -// -// define internal void @f0() { -// entry: -// %0 = load i16, i16 addrspace(3)* @lds.ptr, align 2 -// %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 -// %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* -// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, -// i32 0, i32 0 -// ret void -// } -// -// define protected amdgpu_kernel void @k0() { -// entry: -// store i16 ptrtoint ([4 x i32] addrspace(3)* @lds to i16), -// i16 addrspace(3)* @lds.ptr, align 2 -// call void @f0() -// ret void -// } -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "GCNSubtarget.h" -#include "Utils/AMDGPUBaseInfo.h" -#include "Utils/AMDGPUMemoryUtils.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetOperations.h" -#include "llvm/Analysis/CallGraph.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InlineAsm.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicsAMDGPU.h" -#include "llvm/IR/ReplaceConstant.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" -#include "llvm/Support/Debug.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/ModuleUtils.h" -#include -#include - -#define DEBUG_TYPE "amdgpu-replace-lds-use-with-pointer" - -using namespace llvm; - -namespace { - -namespace AMDGPU { -/// Collect all the instructions where user \p U belongs to. \p U could be -/// instruction itself or it could be a constant expression which is used within -/// an instruction. If \p CollectKernelInsts is true, collect instructions only -/// from kernels, otherwise collect instructions only from non-kernel functions. -DenseMap> -getFunctionToInstsMap(User *U, bool CollectKernelInsts); - -SmallPtrSet collectNonKernelAccessorsOfLDS(GlobalVariable *GV); - -} // namespace AMDGPU - -class ReplaceLDSUseImpl { - Module &M; - LLVMContext &Ctx; - const DataLayout &DL; - Constant *LDSMemBaseAddr; - - DenseMap LDSToPointer; - DenseMap> LDSToNonKernels; - DenseMap> KernelToCallees; - DenseMap> KernelToLDSPointers; - DenseMap KernelToInitBB; - DenseMap> - FunctionToLDSToReplaceInst; - - // Collect LDS which requires their uses to be replaced by pointer. - std::vector collectLDSRequiringPointerReplace() { - // Collect LDS which requires module lowering. - std::vector LDSGlobals = - llvm::AMDGPU::findLDSVariablesToLower(M, nullptr); - - // Remove LDS which don't qualify for replacement. - llvm::erase_if(LDSGlobals, [&](GlobalVariable *GV) { - return shouldIgnorePointerReplacement(GV); - }); - - return LDSGlobals; - } - - // Returns true if uses of given LDS global within non-kernel functions should - // be keep as it is without pointer replacement. - bool shouldIgnorePointerReplacement(GlobalVariable *GV) { - // LDS whose size is very small and doesn't exceed pointer size is not worth - // replacing. - if (DL.getTypeAllocSize(GV->getValueType()) <= 2) - return true; - - // LDS which is not used from non-kernel function scope or it is used from - // global scope does not qualify for replacement. - LDSToNonKernels[GV] = AMDGPU::collectNonKernelAccessorsOfLDS(GV); - return LDSToNonKernels[GV].empty(); - - // FIXME: When GV is used within all (or within most of the kernels), then - // it does not make sense to create a pointer for it. - } - - // Insert new global LDS pointer which points to LDS. - GlobalVariable *createLDSPointer(GlobalVariable *GV) { - // LDS pointer which points to LDS is already created? Return it. - auto PointerEntry = LDSToPointer.insert(std::pair(GV, nullptr)); - if (!PointerEntry.second) - return PointerEntry.first->second; - - // We need to create new LDS pointer which points to LDS. - // - // Each CU owns at max 64K of LDS memory, so LDS address ranges from 0 to - // 2^16 - 1. Hence 16 bit pointer is enough to hold the LDS address. - auto *I16Ty = Type::getInt16Ty(Ctx); - GlobalVariable *LDSPointer = new GlobalVariable( - M, I16Ty, false, GlobalValue::InternalLinkage, UndefValue::get(I16Ty), - GV->getName() + Twine(".ptr"), nullptr, GlobalVariable::NotThreadLocal, - AMDGPUAS::LOCAL_ADDRESS); - - LDSPointer->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - LDSPointer->setAlignment(llvm::AMDGPU::getAlign(DL, LDSPointer)); - - // Mark that an associated LDS pointer is created for LDS. - LDSToPointer[GV] = LDSPointer; - - return LDSPointer; - } - - // Split entry basic block in such a way that only lane 0 of each wave does - // the LDS pointer initialization, and return newly created basic block. - BasicBlock *activateLaneZero(Function *K) { - // If the entry basic block of kernel K is already split, then return - // newly created basic block. - auto BasicBlockEntry = KernelToInitBB.insert(std::pair(K, nullptr)); - if (!BasicBlockEntry.second) - return BasicBlockEntry.first->second; - - // Split entry basic block of kernel K. - auto *EI = &(*(K->getEntryBlock().getFirstInsertionPt())); - IRBuilder<> Builder(EI); - - Value *Mbcnt = - Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, - {Builder.getInt32(-1), Builder.getInt32(0)}); - Value *Cond = Builder.CreateICmpEQ(Mbcnt, Builder.getInt32(0)); - Instruction *WB = cast( - Builder.CreateIntrinsic(Intrinsic::amdgcn_wave_barrier, {}, {})); - - BasicBlock *NBB = SplitBlockAndInsertIfThen(Cond, WB, false)->getParent(); - - // Mark that the entry basic block of kernel K is split. - KernelToInitBB[K] = NBB; - - return NBB; - } - - // Within given kernel, initialize given LDS pointer to point to given LDS. - void initializeLDSPointer(Function *K, GlobalVariable *GV, - GlobalVariable *LDSPointer) { - // If LDS pointer is already initialized within K, then nothing to do. - auto PointerEntry = KernelToLDSPointers.insert( - std::pair(K, SmallPtrSet())); - if (!PointerEntry.second) - if (PointerEntry.first->second.contains(LDSPointer)) - return; - - // Insert instructions at EI which initialize LDS pointer to point-to LDS - // within kernel K. - // - // That is, convert pointer type of GV to i16, and then store this converted - // i16 value within LDSPointer which is of type i16*. - auto *EI = &(*(activateLaneZero(K)->getFirstInsertionPt())); - IRBuilder<> Builder(EI); - Builder.CreateStore(Builder.CreatePtrToInt(GV, Type::getInt16Ty(Ctx)), - LDSPointer); - - // Mark that LDS pointer is initialized within kernel K. - KernelToLDSPointers[K].insert(LDSPointer); - } - - // We have created an LDS pointer for LDS, and initialized it to point-to LDS - // within all relevant kernels. Now replace all the uses of LDS within - // non-kernel functions by LDS pointer. - void replaceLDSUseByPointer(GlobalVariable *GV, GlobalVariable *LDSPointer) { - SmallVector LDSUsers(GV->users()); - for (auto *U : LDSUsers) { - // When `U` is a constant expression, it is possible that same constant - // expression exists within multiple instructions, and within multiple - // non-kernel functions. Collect all those non-kernel functions and all - // those instructions within which `U` exist. - auto FunctionToInsts = - AMDGPU::getFunctionToInstsMap(U, false /*=CollectKernelInsts*/); - - for (const auto &FunctionToInst : FunctionToInsts) { - Function *F = FunctionToInst.first; - auto &Insts = FunctionToInst.second; - for (auto *I : Insts) { - // If `U` is a constant expression, then we need to break the - // associated instruction into a set of separate instructions by - // converting constant expressions into instructions. - SmallPtrSet UserInsts; - - if (U == I) { - // `U` is an instruction, conversion from constant expression to - // set of instructions is *not* required. - UserInsts.insert(I); - } else { - // `U` is a constant expression, convert it into corresponding set - // of instructions. - auto *CE = cast(U); - convertConstantExprsToInstructions(I, CE, &UserInsts); - } - - // Go through all the user instructions, if LDS exist within them as - // an operand, then replace it by replace instruction. - for (auto *II : UserInsts) { - auto *ReplaceInst = getReplacementInst(F, GV, LDSPointer); - II->replaceUsesOfWith(GV, ReplaceInst); - } - } - } - } - } - - // Create a set of replacement instructions which together replace LDS within - // non-kernel function F by accessing LDS indirectly using LDS pointer. - Value *getReplacementInst(Function *F, GlobalVariable *GV, - GlobalVariable *LDSPointer) { - // If the instruction which replaces LDS within F is already created, then - // return it. - auto LDSEntry = FunctionToLDSToReplaceInst.insert( - std::pair(F, DenseMap())); - if (!LDSEntry.second) { - auto ReplaceInstEntry = - LDSEntry.first->second.insert(std::pair(GV, nullptr)); - if (!ReplaceInstEntry.second) - return ReplaceInstEntry.first->second; - } - - // Get the instruction insertion point within the beginning of the entry - // block of current non-kernel function. - auto *EI = &(*(F->getEntryBlock().getFirstInsertionPt())); - IRBuilder<> Builder(EI); - - // Insert required set of instructions which replace LDS within F. - auto *V = Builder.CreateBitCast( - Builder.CreateGEP( - Builder.getInt8Ty(), LDSMemBaseAddr, - Builder.CreateLoad(LDSPointer->getValueType(), LDSPointer)), - GV->getType()); - - // Mark that the replacement instruction which replace LDS within F is - // created. - FunctionToLDSToReplaceInst[F][GV] = V; - - return V; - } - -public: - ReplaceLDSUseImpl(Module &M) - : M(M), Ctx(M.getContext()), DL(M.getDataLayout()) { - LDSMemBaseAddr = Constant::getIntegerValue( - PointerType::get(Type::getInt8Ty(M.getContext()), - AMDGPUAS::LOCAL_ADDRESS), - APInt(32, 0)); - } - - // Entry-point function which interface ReplaceLDSUseImpl with outside of the - // class. - bool replaceLDSUse(); - -private: - // For a given LDS from collected LDS globals set, replace its non-kernel - // function scope uses by pointer. - bool replaceLDSUse(GlobalVariable *GV); -}; - -// For given LDS from collected LDS globals set, replace its non-kernel function -// scope uses by pointer. -bool ReplaceLDSUseImpl::replaceLDSUse(GlobalVariable *GV) { - // Holds all those non-kernel functions within which LDS is being accessed. - SmallPtrSet &LDSAccessors = LDSToNonKernels[GV]; - - // The LDS pointer which points to LDS and replaces all the uses of LDS. - GlobalVariable *LDSPointer = nullptr; - - // Traverse through each kernel K, check and if required, initialize the - // LDS pointer to point to LDS within K. - for (const auto &KernelToCallee : KernelToCallees) { - Function *K = KernelToCallee.first; - SmallPtrSet Callees = KernelToCallee.second; - - // Compute reachable and LDS used callees for kernel K. - set_intersect(Callees, LDSAccessors); - - // None of the LDS accessing non-kernel functions are reachable from - // kernel K. Hence, no need to initialize LDS pointer within kernel K. - if (Callees.empty()) - continue; - - // We have found reachable and LDS used callees for kernel K, and we need to - // initialize LDS pointer within kernel K, and we need to replace LDS use - // within those callees by LDS pointer. - // - // But, first check if LDS pointer is already created, if not create one. - LDSPointer = createLDSPointer(GV); - - // Initialize LDS pointer to point to LDS within kernel K. - initializeLDSPointer(K, GV, LDSPointer); - } - - // We have not found reachable and LDS used callees for any of the kernels, - // and hence we have not created LDS pointer. - if (!LDSPointer) - return false; - - // We have created an LDS pointer for LDS, and initialized it to point-to LDS - // within all relevant kernels. Now replace all the uses of LDS within - // non-kernel functions by LDS pointer. - replaceLDSUseByPointer(GV, LDSPointer); - - return true; -} - -namespace AMDGPU { - -// An helper class for collecting all reachable callees for each kernel defined -// within the module. -class CollectReachableCallees { - Module &M; - CallGraph CG; - SmallPtrSet AddressTakenFunctions; - - // Collect all address taken functions within the module. - void collectAddressTakenFunctions() { - auto *ECNode = CG.getExternalCallingNode(); - - for (const auto &GI : *ECNode) { - auto *CGN = GI.second; - auto *F = CGN->getFunction(); - if (!F || F->isDeclaration() || llvm::AMDGPU::isKernelCC(F)) - continue; - AddressTakenFunctions.insert(CGN); - } - } - - // For given kernel, collect all its reachable non-kernel functions. - SmallPtrSet collectReachableCallees(Function *K) { - SmallPtrSet ReachableCallees; - - // Call graph node which represents this kernel. - auto *KCGN = CG[K]; - - // Go through all call graph nodes reachable from the node representing this - // kernel, visit all their call sites, if the call site is direct, add - // corresponding callee to reachable callee set, if it is indirect, resolve - // the indirect call site to potential reachable callees, add them to - // reachable callee set, and repeat the process for the newly added - // potential callee nodes. - // - // FIXME: Need to handle bit-casted function pointers. - // - SmallVector CGNStack(depth_first(KCGN)); - SmallPtrSet VisitedCGNodes; - while (!CGNStack.empty()) { - auto *CGN = CGNStack.pop_back_val(); - - if (!VisitedCGNodes.insert(CGN).second) - continue; - - // Ignore call graph node which does not have associated function or - // associated function is not a definition. - if (!CGN->getFunction() || CGN->getFunction()->isDeclaration()) - continue; - - for (const auto &GI : *CGN) { - auto *RCB = cast(*GI.first); - auto *RCGN = GI.second; - - if (auto *DCallee = RCGN->getFunction()) { - ReachableCallees.insert(DCallee); - } else if (RCB->isIndirectCall()) { - auto *RCBFTy = RCB->getFunctionType(); - for (auto *ACGN : AddressTakenFunctions) { - auto *ACallee = ACGN->getFunction(); - if (ACallee->getFunctionType() == RCBFTy) { - ReachableCallees.insert(ACallee); - CGNStack.append(df_begin(ACGN), df_end(ACGN)); - } - } - } - } - } - - return ReachableCallees; - } - -public: - explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) { - // Collect address taken functions. - collectAddressTakenFunctions(); - } - - void collectReachableCallees( - DenseMap> &KernelToCallees) { - // Collect reachable callee set for each kernel defined in the module. - for (Function &F : M.functions()) { - if (!llvm::AMDGPU::isKernelCC(&F)) - continue; - Function *K = &F; - KernelToCallees[K] = collectReachableCallees(K); - } - } -}; - -/// Collect reachable callees for each kernel defined in the module \p M and -/// return collected callees at \p KernelToCallees. -void collectReachableCallees( - Module &M, - DenseMap> &KernelToCallees) { - CollectReachableCallees CRC{M}; - CRC.collectReachableCallees(KernelToCallees); -} - -/// For the given LDS global \p GV, visit all its users and collect all -/// non-kernel functions within which \p GV is used and return collected list of -/// such non-kernel functions. -SmallPtrSet collectNonKernelAccessorsOfLDS(GlobalVariable *GV) { - SmallPtrSet LDSAccessors; - SmallVector UserStack(GV->users()); - SmallPtrSet VisitedUsers; - - while (!UserStack.empty()) { - auto *U = UserStack.pop_back_val(); - - // `U` is already visited? continue to next one. - if (!VisitedUsers.insert(U).second) - continue; - - // `U` is a global variable which is initialized with LDS. Ignore LDS. - if (isa(U)) - return SmallPtrSet(); - - // Recursively explore constant users. - if (isa(U)) { - append_range(UserStack, U->users()); - continue; - } - - // `U` should be an instruction, if it belongs to a non-kernel function F, - // then collect F. - Function *F = cast(U)->getFunction(); - if (!llvm::AMDGPU::isKernelCC(F)) - LDSAccessors.insert(F); - } - - return LDSAccessors; -} - -DenseMap> -getFunctionToInstsMap(User *U, bool CollectKernelInsts) { - DenseMap> FunctionToInsts; - SmallVector UserStack; - SmallPtrSet VisitedUsers; - - UserStack.push_back(U); - - while (!UserStack.empty()) { - auto *UU = UserStack.pop_back_val(); - - if (!VisitedUsers.insert(UU).second) - continue; - - if (isa(UU)) - continue; - - if (isa(UU)) { - append_range(UserStack, UU->users()); - continue; - } - - auto *I = cast(UU); - Function *F = I->getFunction(); - if (CollectKernelInsts) { - if (!llvm::AMDGPU::isKernelCC(F)) { - continue; - } - } else { - if (llvm::AMDGPU::isKernelCC(F)) { - continue; - } - } - - FunctionToInsts.insert(std::pair(F, SmallPtrSet())); - FunctionToInsts[F].insert(I); - } - - return FunctionToInsts; -} - -} // namespace AMDGPU - -// Entry-point function which interface ReplaceLDSUseImpl with outside of the -// class. -bool ReplaceLDSUseImpl::replaceLDSUse() { - // Collect LDS which requires their uses to be replaced by pointer. - std::vector LDSGlobals = - collectLDSRequiringPointerReplace(); - - // No LDS to pointer-replace. Nothing to do. - if (LDSGlobals.empty()) - return false; - - // Collect reachable callee set for each kernel defined in the module. - AMDGPU::collectReachableCallees(M, KernelToCallees); - - if (KernelToCallees.empty()) { - // Either module does not have any kernel definitions, or none of the kernel - // has a call to non-kernel functions, or we could not resolve any of the - // call sites to proper non-kernel functions, because of the situations like - // inline asm calls. Nothing to replace. - return false; - } - - // For every LDS from collected LDS globals set, replace its non-kernel - // function scope use by pointer. - bool Changed = false; - for (auto *GV : LDSGlobals) - Changed |= replaceLDSUse(GV); - - return Changed; -} - -class AMDGPUReplaceLDSUseWithPointer : public ModulePass { -public: - static char ID; - - AMDGPUReplaceLDSUseWithPointer() : ModulePass(ID) { - initializeAMDGPUReplaceLDSUseWithPointerPass( - *PassRegistry::getPassRegistry()); - } - - bool runOnModule(Module &M) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - } -}; - -} // namespace - -char AMDGPUReplaceLDSUseWithPointer::ID = 0; -char &llvm::AMDGPUReplaceLDSUseWithPointerID = - AMDGPUReplaceLDSUseWithPointer::ID; - -INITIALIZE_PASS_BEGIN( - AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE, - "Replace within non-kernel function use of LDS with pointer", - false /*only look at the cfg*/, false /*analysis pass*/) -INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_END( - AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE, - "Replace within non-kernel function use of LDS with pointer", - false /*only look at the cfg*/, false /*analysis pass*/) - -bool AMDGPUReplaceLDSUseWithPointer::runOnModule(Module &M) { - ReplaceLDSUseImpl LDSUseReplacer{M}; - return LDSUseReplacer.replaceLDSUse(); -} - -ModulePass *llvm::createAMDGPUReplaceLDSUseWithPointerPass() { - return new AMDGPUReplaceLDSUseWithPointer(); -} - -PreservedAnalyses -AMDGPUReplaceLDSUseWithPointerPass::run(Module &M, ModuleAnalysisManager &AM) { - ReplaceLDSUseImpl LDSUseReplacer{M}; - LDSUseReplacer.replaceLDSUse(); - return PreservedAnalyses::all(); -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -316,11 +316,6 @@ cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), cl::Hidden); -static cl::opt EnableLDSReplaceWithPointer( - "amdgpu-enable-lds-replace-with-pointer", - cl::desc("Enable LDS replace with pointer pass"), cl::init(false), - cl::Hidden); - static cl::opt EnableLowerModuleLDS( "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"), cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true), @@ -388,7 +383,6 @@ initializeAMDGPUPropagateAttributesEarlyPass(*PR); initializeAMDGPUPropagateAttributesLatePass(*PR); initializeAMDGPURemoveIncompatibleFunctionsPass(*PR); - initializeAMDGPUReplaceLDSUseWithPointerPass(*PR); initializeAMDGPULowerModuleLDSPass(*PR); initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPURewriteUndefForPHIPass(*PR); @@ -611,10 +605,6 @@ PM.addPass(AMDGPUAlwaysInlinePass()); return true; } - if (PassName == "amdgpu-replace-lds-use-with-pointer") { - PM.addPass(AMDGPUReplaceLDSUseWithPointerPass()); - return true; - } if (PassName == "amdgpu-lower-module-lds") { PM.addPass(AMDGPULowerModuleLDSPass()); return true; @@ -988,14 +978,8 @@ // Replace OpenCL enqueued block function pointers with global variables. addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass()); - // Can increase LDS used by kernel so runs before PromoteAlloca + // Runs before PromoteAlloca so the latter can account for function uses if (EnableLowerModuleLDS) { - // The pass "amdgpu-replace-lds-use-with-pointer" need to be run before the - // pass "amdgpu-lower-module-lds", and also it required to be run only if - // "amdgpu-lower-module-lds" pass is enabled. - if (EnableLDSReplaceWithPointer) - addPass(createAMDGPUReplaceLDSUseWithPointerPass()); - addPass(createAMDGPULowerModuleLDSPass()); } diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -91,7 +91,6 @@ AMDGPURegisterBankInfo.cpp AMDGPUReleaseVGPRs.cpp AMDGPURemoveIncompatibleFunctions.cpp - AMDGPUReplaceLDSUseWithPointer.cpp AMDGPUResourceUsageAnalysis.cpp AMDGPURewriteOutArguments.cpp AMDGPURewriteUndefForPHI.cpp diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h @@ -30,8 +30,6 @@ bool isDynamicLDS(const GlobalVariable &GV); bool isLDSVariableToLower(const GlobalVariable &GV); -std::vector findLDSVariablesToLower(Module &M, - const Function *F); /// Given a \p Def clobbering a load from \p Ptr according to the MSSA check /// if this is actually a memory update or an artificial clobber to facilitate diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp @@ -31,50 +31,6 @@ GV->getValueType()); } -static bool shouldLowerLDSToStruct(const GlobalVariable &GV, - const Function *F) { - // We are not interested in kernel LDS lowering for module LDS itself. - if (F && GV.getName() == "llvm.amdgcn.module.lds") - return false; - - bool Ret = false; - SmallPtrSet Visited; - SmallVector Stack(GV.users()); - - assert(!F || isKernelCC(F)); - - while (!Stack.empty()) { - const User *V = Stack.pop_back_val(); - Visited.insert(V); - - if (isa(V)) { - // This use of the LDS variable is the initializer of a global variable. - // This is ill formed. The address of an LDS variable is kernel dependent - // and unknown until runtime. It can't be written to a global variable. - continue; - } - - if (auto *I = dyn_cast(V)) { - const Function *UF = I->getFunction(); - if (UF == F) { - // Used from this kernel, we want to put it into the structure. - Ret = true; - } else if (!F) { - // For module LDS lowering, lowering is required if the user instruction - // is from non-kernel function. - Ret |= !isKernelCC(UF); - } - continue; - } - - // User V should be a constant, recursively visit users of V. - assert(isa(V) && "Expected a constant."); - append_range(Stack, V->users()); - } - - return Ret; -} - bool isDynamicLDS(const GlobalVariable &GV) { // external zero size addrspace(3) without initializer implies cuda/hip extern // __shared__ the semantics for such a variable appears to be that all extern @@ -109,21 +65,6 @@ return true; } -std::vector findLDSVariablesToLower(Module &M, - const Function *F) { - std::vector LocalVars; - for (auto &GV : M.globals()) { - if (!isLDSVariableToLower(GV)) { - continue; - } - if (!shouldLowerLDSToStruct(GV, F)) { - continue; - } - LocalVars.push_back(&GV); - } - return LocalVars; -} - bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) { Instruction *DefInst = Def->getMemoryInst(); diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-diamond-shape.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-diamond-shape.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-diamond-shape.ll +++ /dev/null @@ -1,87 +0,0 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s - -; DESCRIPTION: -; -; The lds global @lds_used_within_func is used within non-kernel function @func_uses_lds -; which is recheable from kernel @kernel_reaches_lds, hence pointer replacement takes place -; for @lds_used_within_func. -; - -; Original LDS should exist. -; CHECK: @lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4 -@lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4 - -; Pointer should be created. -; CHECK: @lds_used_within_func.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 - -; Pointer replacement code should be added. -define internal void @func_uses_lds() { -; CHECK-LABEL: entry: -; CHECK: %0 = load i16, ptr addrspace(3) @lds_used_within_func.ptr, align 2 -; CHECK: %1 = getelementptr i8, ptr addrspace(3) null, i16 %0 -; CHECK: %gep = getelementptr inbounds [4 x i32], ptr addrspace(3) %1, i32 0, i32 0 -; CHECK: ret void -entry: - %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_func, i32 0, i32 0 - ret void -} - -; No change -define internal void @func_does_not_use_lds_3() { -; CHECK-LABEL: entry: -; CHECK: call void @func_uses_lds() -; CHECK: ret void -entry: - call void @func_uses_lds() - ret void -} - -; No change -define internal void @func_does_not_use_lds_2() { -; CHECK-LABEL: entry: -; CHECK: call void @func_uses_lds() -; CHECK: ret void -entry: - call void @func_uses_lds() - ret void -} - -; No change -define internal void @func_does_not_use_lds_1() { -; CHECK-LABEL: entry: -; CHECK: call void @func_does_not_use_lds_2() -; CHECK: call void @func_does_not_use_lds_3() -; CHECK: ret void -entry: - call void @func_does_not_use_lds_2() - call void @func_does_not_use_lds_3() - ret void -} - -; Pointer initialization code shoud be added -define protected amdgpu_kernel void @kernel_reaches_lds() { -; CHECK-LABEL: entry: -; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK: %1 = icmp eq i32 %0, 0 -; CHECK: br i1 %1, label %2, label %3 -; -; CHECK-LABEL: 2: -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds_used_within_func to i16), ptr addrspace(3) @lds_used_within_func.ptr, align 2 -; CHECK: br label %3 -; -; CHECK-LABEL: 3: -; CHECK: call void @llvm.amdgcn.wave.barrier() -; CHECK: call void @func_does_not_use_lds_1() -; CHECK: ret void -entry: - call void @func_does_not_use_lds_1() - ret void -} - -; No change here since this kernel does not reach @func_uses_lds which uses lds. -define protected amdgpu_kernel void @kernel_does_not_reach_lds() { -; CHECK-LABEL: entry: -; CHECK: ret void -entry: - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-selected_functions.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-selected_functions.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-selected_functions.ll +++ /dev/null @@ -1,127 +0,0 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s - -; DESCRIPTION: -; -; There are three lds globals defined here, and these three lds are used respectively within -; three non-kernel functions. There are three kernels, which call two of the non-kernel functions. -; Hence pointer replacement should take place for all three lds, and pointer initialization within -; kernel should selectively happen depending on which lds is reachable from the kernel. -; - -; Original LDS should exist. -; CHECK: @lds_used_within_function_1 = internal addrspace(3) global [1 x i32] undef, align 4 -; CHECK: @lds_used_within_function_2 = internal addrspace(3) global [2 x i32] undef, align 4 -; CHECK: @lds_used_within_function_3 = internal addrspace(3) global [3 x i32] undef, align 4 -@lds_used_within_function_1 = internal addrspace(3) global [1 x i32] undef, align 4 -@lds_used_within_function_2 = internal addrspace(3) global [2 x i32] undef, align 4 -@lds_used_within_function_3 = internal addrspace(3) global [3 x i32] undef, align 4 - -; Pointers should be created. -; CHECK: @lds_used_within_function_1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 -; CHECK: @lds_used_within_function_2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 -; CHECK: @lds_used_within_function_3.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 - -; Pointer replacement code should be added. -define internal void @function_3() { -; CHECK-LABEL: entry: -; CHECK: %0 = load i16, ptr addrspace(3) @lds_used_within_function_3.ptr, align 2 -; CHECK: %1 = getelementptr i8, ptr addrspace(3) null, i16 %0 -; CHECK: %gep = getelementptr inbounds [3 x i32], ptr addrspace(3) %1, i32 0, i32 0 -; CHECK: ret void -entry: - %gep = getelementptr inbounds [3 x i32], ptr addrspace(3) @lds_used_within_function_3, i32 0, i32 0 - ret void -} - -; Pointer replacement code should be added. -define internal void @function_2() { -; CHECK-LABEL: entry: -; CHECK: %0 = load i16, ptr addrspace(3) @lds_used_within_function_2.ptr, align 2 -; CHECK: %1 = getelementptr i8, ptr addrspace(3) null, i16 %0 -; CHECK: %gep = getelementptr inbounds [2 x i32], ptr addrspace(3) %1, i32 0, i32 0 -; CHECK: ret void -entry: - %gep = getelementptr inbounds [2 x i32], ptr addrspace(3) @lds_used_within_function_2, i32 0, i32 0 - ret void -} - -; Pointer replacement code should be added. -define internal void @function_1() { -; CHECK-LABEL: entry: -; CHECK: %0 = load i16, ptr addrspace(3) @lds_used_within_function_1.ptr, align 2 -; CHECK: %1 = getelementptr i8, ptr addrspace(3) null, i16 %0 -; CHECK: %gep = getelementptr inbounds [1 x i32], ptr addrspace(3) %1, i32 0, i32 0 -; CHECK: ret void -entry: - %gep = getelementptr inbounds [1 x i32], ptr addrspace(3) @lds_used_within_function_1, i32 0, i32 0 - ret void -} - -; Pointer initialization code shoud be added -define protected amdgpu_kernel void @kernel_calls_function_3_and_1() { -; CHECK-LABEL: entry: -; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK: %1 = icmp eq i32 %0, 0 -; CHECK: br i1 %1, label %2, label %3 -; -; CHECK-LABEL: 2: -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds_used_within_function_3 to i16), ptr addrspace(3) @lds_used_within_function_3.ptr, align 2 -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds_used_within_function_1 to i16), ptr addrspace(3) @lds_used_within_function_1.ptr, align 2 -; CHECK: br label %3 -; -; CHECK-LABEL: 3: -; CHECK: call void @llvm.amdgcn.wave.barrier() -; CHECK: call void @function_3() -; CHECK: call void @function_1() -; CHECK: ret void -entry: - call void @function_3() - call void @function_1() - ret void -} - -; Pointer initialization code shoud be added -define protected amdgpu_kernel void @kernel_calls_function_2_and_3() { -; CHECK-LABEL: entry: -; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK: %1 = icmp eq i32 %0, 0 -; CHECK: br i1 %1, label %2, label %3 -; -; CHECK-LABEL: 2: -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds_used_within_function_3 to i16), ptr addrspace(3) @lds_used_within_function_3.ptr, align 2 -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds_used_within_function_2 to i16), ptr addrspace(3) @lds_used_within_function_2.ptr, align 2 -; CHECK: br label %3 -; -; CHECK-LABEL: 3: -; CHECK: call void @llvm.amdgcn.wave.barrier() -; CHECK: call void @function_2() -; CHECK: call void @function_3() -; CHECK: ret void -entry: - call void @function_2() - call void @function_3() - ret void -} - -; Pointer initialization code shoud be added -define protected amdgpu_kernel void @kernel_calls_function_1_and_2() { -; CHECK-LABEL: entry: -; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK: %1 = icmp eq i32 %0, 0 -; CHECK: br i1 %1, label %2, label %3 -; -; CHECK-LABEL: 2: -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds_used_within_function_2 to i16), ptr addrspace(3) @lds_used_within_function_2.ptr, align 2 -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds_used_within_function_1 to i16), ptr addrspace(3) @lds_used_within_function_1.ptr, align 2 -; CHECK: br label %3 -; -; CHECK-LABEL: 3: -; CHECK: call void @llvm.amdgcn.wave.barrier() -; CHECK: call void @function_1() -; CHECK: call void @function_2() -; CHECK: ret void -entry: - call void @function_1() - call void @function_2() - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-to-declare-only-func.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-to-declare-only-func.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-to-declare-only-func.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s - -; DESCRIPTION: -; -; The kernel 'kern' makes a call to declared only function `foo`, hence `foo` -; is not considered as reachable callee, and is ignored. The function `goo` -; which uses LDS is not called from kernel 'kern', hence it is also ignored. -; - -; Original LDS should exist. -; CHECK: @lds = internal local_unnamed_addr addrspace(3) global i32 undef, align 4 -@lds = internal local_unnamed_addr addrspace(3) global i32 undef, align 4 - -; Pointer should not be created. -; CHECK-NOT: @lds.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 - -; CHECK: declare i32 @foo() -declare i32 @foo() - -; No change -define internal void @goo() { -; CHECK-LABEL: entry: -; CHECK: store i32 undef, ptr addrspace(3) @lds, align 4 -; CHECK: ret void -entry: - store i32 undef, ptr addrspace(3) @lds, align 4 - ret void -} - -; No change -define weak amdgpu_kernel void @kern() { -; CHECK-LABEL: entry: -; CHECK-LABEL: %nt = call i32 @foo() -; CHECK-LABEL: ret void -entry: - %nt = call i32 @foo() - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll +++ /dev/null @@ -1,49 +0,0 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s - -; DESCRIPTION: -; -; None of lds are pointer-replaced since they are all used in global scope in one or the other way. -; - -; CHECK: @lds = internal addrspace(3) global [4 x i32] undef, align 4 -; CHECK: @lds.1 = addrspace(3) global i16 undef, align 2 -; CHECK: @lds.2 = addrspace(3) global i32 undef, align 4 -; CHECK: @lds.3 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1 -@lds = internal addrspace(3) global [4 x i32] undef, align 4 -@lds.1 = addrspace(3) global i16 undef, align 2 -@lds.2 = addrspace(3) global i32 undef, align 4 -@lds.3 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1 - -; CHECK: @global_var = addrspace(1) global ptr addrspacecast (ptr addrspace(3) @lds to ptr), align 8 -; CHECK: @llvm.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @lds.1 to ptr)], section "llvm.metadata" -; CHECK: @llvm.compiler.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @lds.2 to ptr)], section "llvm.metadata" -; CHECK: @alias.to.lds.3 = alias [1 x i8], ptr addrspace(3) @lds.3 -@global_var = addrspace(1) global ptr addrspacecast (ptr addrspace(3) @lds to ptr), align 8 -@llvm.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @lds.1 to ptr)], section "llvm.metadata" -@llvm.compiler.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @lds.2 to ptr)], section "llvm.metadata" -@alias.to.lds.3 = alias [1 x i8], ptr addrspace(3) @lds.3 - -; CHECK-NOT: @lds.ptr -; CHECK-NOT: @lds.1.ptr -; CHECK-NOT: @lds.2.ptr -; CHECK-NOT: @lds.3.ptr - -define void @f0() { -; CHECK-LABEL: entry: -; CHECK: %ld1 = load i16, ptr addrspace(3) @lds.1 -; CHECK: %ld2 = load i32, ptr addrspace(3) @lds.2 -; CHECK: ret void -entry: - %ld1 = load i16, ptr addrspace(3) @lds.1 - %ld2 = load i32, ptr addrspace(3) @lds.2 - ret void -} - -define protected amdgpu_kernel void @k0() { -; CHECK-LABEL: entry: -; CHECK: call void @f0() -; CHECK: ret void -entry: - call void @f0() - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s - -; DESCRIPTION: -; -; We do not know what to do with inline asm call, we ignore it, hence pointer replacement for -; @used_only_within_func does not take place. -; - -; CHECK: @used_only_within_func = addrspace(3) global [4 x i32] undef, align 4 -@used_only_within_func = addrspace(3) global [4 x i32] undef, align 4 - -; CHECK-NOT: @used_only_within_func.ptr - -define void @f0(i32 %x) { -; CHECK-LABEL: entry: -; CHECK: store i32 %x, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_func to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_func to ptr) to i64)) to ptr), align 4 -; CHECK: ret void -entry: - store i32 %x, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_func to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_func to ptr) to i64)) to ptr), align 4 - ret void -} - -define amdgpu_kernel void @k0() { -; CHECK-LABEL: entry: -; CHECK: call i32 asm "s_mov_b32 $0, 0", "=s"() -; CHECK: ret void -entry: - call i32 asm "s_mov_b32 $0, 0", "=s"() - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s - -; DESCRIPTION ; -; -; LDS global @used_only_within_kern is used only within kernel @k0, hence pointer replacement -; does not take place for @used_only_within_kern. -; - -; CHECK: @used_only_within_kern = addrspace(3) global [4 x i32] undef, align 4 -@used_only_within_kern = addrspace(3) global [4 x i32] undef, align 4 - -; CHECK-NOT: @used_only_within_kern.ptr - -define amdgpu_kernel void @k0() { -; CHECK-LABEL: entry: -; CHECK: %ld = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_kern to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_kern to ptr) to i64)) to ptr), align 4 -; CHECK: %mul = mul i32 %ld, 2 -; CHECK: store i32 %mul, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_kern to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_kern to ptr) to i64)) to ptr), align 4 -; CHECK: ret void -entry: - %ld = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_kern to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_kern to ptr) to i64)) to ptr), align 4 - %mul = mul i32 %ld, 2 - store i32 %mul, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_kern to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_kern to ptr) to i64)) to ptr), align 4 - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s - -; DESCRIPTION ; -; -; LDS global @not-reachable-lds is used within non-kernel function @f0, but @f0 is *not* -; reachable from kernel @k, hence pointer replacement does not take place. -; - -; CHECK: @not-reachable-lds = internal addrspace(3) global [4 x i32] undef, align 4 -@not-reachable-lds = internal addrspace(3) global [4 x i32] undef, align 4 - -; CHECK-NOT: @not-reachable-lds.ptr - -define internal void @f0() { -; CHECK-LABEL: entry: -; CHECK: ret void -entry: - ret void -} - -define protected amdgpu_kernel void @k0() { -; CHECK-LABEL: entry: -; CHECK: ret void -entry: - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll +++ /dev/null @@ -1,31 +0,0 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s - -; DESCRIPTION ; -; -; LDS global @small_lds is used within non-kernel function @f0, and @f0 is reachable -; from kernel @k0, but since @small_lds too small for pointer replacement, pointer -; replacement does not take place. -; - -; CHECK: @small_lds = addrspace(3) global i8 undef, align 1 -@small_lds = addrspace(3) global i8 undef, align 1 - -; CHECK-NOT: @small_lds.ptr - -define void @f0() { -; CHECK-LABEL: entry: -; CHECK: store i8 1, ptr addrspace(3) @small_lds, align 1 -; CHECK: ret void -entry: - store i8 1, ptr addrspace(3) @small_lds, align 1 - ret void -} - -define amdgpu_kernel void @k0() { -; CHECK-LABEL: entry: -; CHECK: call void @f0() -; CHECK: ret void -entry: - call void @f0() - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-diamond-shape.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-diamond-shape.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-diamond-shape.ll +++ /dev/null @@ -1,94 +0,0 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s - -; DESCRIPTION: -; -; The lds global @lds_used_within_func is used within non-kernel function @func_uses_lds -; which is *indirectly* recheable from kernel @kernel_reaches_lds, hence pointer replacement -; takes place for @lds_used_within_func. - -; Original LDS should exit. -; CHECK: @lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4 -@lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4 - -; Function pointer should exist as it is. -; CHECK: @ptr_to_func = internal local_unnamed_addr externally_initialized global ptr @func_uses_lds, align 8 -@ptr_to_func = internal local_unnamed_addr externally_initialized global ptr @func_uses_lds, align 8 - -; Pointer should be created. -; CHECK: @lds_used_within_func.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 - -; Pointer replacement code should be added. -define internal void @func_uses_lds() { -; CHECK-LABEL: entry: -; CHECK: %0 = load i16, ptr addrspace(3) @lds_used_within_func.ptr, align 2 -; CHECK: %1 = getelementptr i8, ptr addrspace(3) null, i16 %0 -; CHECK: %gep = getelementptr inbounds [4 x i32], ptr addrspace(3) %1, i32 0, i32 0 -; CHECK: ret void -entry: - %gep = getelementptr inbounds [4 x i32], ptr addrspace(3) @lds_used_within_func, i32 0, i32 0 - ret void -} - -; No change -define internal void @func_does_not_use_lds_3() { -; CHECK-LABEL: entry: -; CHECK: %fptr = load ptr, ptr @ptr_to_func, align 8 -; CHECK: call void %fptr() -; CHECK: ret void -entry: - %fptr = load ptr, ptr @ptr_to_func, align 8 - call void %fptr() - ret void -} - -; No change -define internal void @func_does_not_use_lds_2() { -; CHECK-LABEL: entry: -; CHECK: %fptr = load ptr, ptr @ptr_to_func, align 8 -; CHECK: call void %fptr() -; CHECK: ret void -entry: - %fptr = load ptr, ptr @ptr_to_func, align 8 - call void %fptr() - ret void -} - -; No change -define internal void @func_does_not_use_lds_1() { -; CHECK-LABEL: entry: -; CHECK: call void @func_does_not_use_lds_2() -; CHECK: call void @func_does_not_use_lds_3() -; CHECK: ret void -entry: - call void @func_does_not_use_lds_2() - call void @func_does_not_use_lds_3() - ret void -} - -; Pointer initialization code shoud be added -define protected amdgpu_kernel void @kernel_reaches_lds() { -; CHECK-LABEL: entry: -; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK: %1 = icmp eq i32 %0, 0 -; CHECK: br i1 %1, label %2, label %3 -; -; CHECK-LABEL: 2: -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds_used_within_func to i16), ptr addrspace(3) @lds_used_within_func.ptr, align 2 -; CHECK: br label %3 -; -; CHECK-LABEL: 3: -; CHECK: call void @llvm.amdgcn.wave.barrier() -; CHECK: call void @func_does_not_use_lds_1() -; CHECK: ret void -entry: - call void @func_does_not_use_lds_1() - ret void -} - -; No change here since this kernel does not reach @func_uses_lds which uses lds. -define protected amdgpu_kernel void @kernel_does_not_reach_lds() { -; CHECK-LABEL: entry: -; CHECK: ret void -entry: - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-selected_functions.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-selected_functions.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-selected_functions.ll +++ /dev/null @@ -1,148 +0,0 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s - -; DESCRIPTION: -; -; There are three lds globals defined here, and these three lds are used respectively within -; three non-kernel functions. There are three kernels, which *indirectly* call two of the -; non-kernel functions. Hence pointer replacement should take place for all three lds, and -; pointer initialization within kernel should selectively happen depending on which lds is -; reachable from the kernel. -; - -; Original LDS should exist. -; CHECK: @lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4 -; CHECK: @lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4 -; CHECK: @lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4 -@lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4 -@lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4 -@lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4 - -; Function pointers should exist. -; CHECK: @ptr_to_func1 = internal local_unnamed_addr externally_initialized global ptr @function_1, align 8 -; CHECK: @ptr_to_func2 = internal local_unnamed_addr externally_initialized global ptr @function_2, align 8 -; CHECK: @ptr_to_func3 = internal local_unnamed_addr externally_initialized global ptr @function_3, align 8 -@ptr_to_func1 = internal local_unnamed_addr externally_initialized global void (float)* @function_1, align 8 -@ptr_to_func2 = internal local_unnamed_addr externally_initialized global void (i16)* @function_2, align 8 -@ptr_to_func3 = internal local_unnamed_addr externally_initialized global void (i8)* @function_3, align 8 - -; Pointers should be created. -; CHECK: @lds_used_within_function_1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 -; CHECK: @lds_used_within_function_2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 -; CHECK: @lds_used_within_function_3.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 - -; Pointer replacement code should be added. -define internal void @function_3(i8 %c) { -; CHECK-LABEL: entry: -; CHECK: %0 = load i16, ptr addrspace(3) @lds_used_within_function_3.ptr, align 2 -; CHECK: %1 = getelementptr i8, ptr addrspace(3) null, i16 %0 -; CHECK: %gep = getelementptr inbounds [4 x i32], ptr addrspace(3) %1, i32 0, i32 0 -; CHECK: ret void -entry: - %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_3, i32 0, i32 0 - ret void -} - -; Pointer replacement code should be added. -define internal void @function_2(i16 %i) { -; CHECK-LABEL: entry: -; CHECK: %0 = load i16, ptr addrspace(3) @lds_used_within_function_2.ptr, align 2 -; CHECK: %1 = getelementptr i8, ptr addrspace(3) null, i16 %0 -; CHECK: %gep = getelementptr inbounds [4 x i32], ptr addrspace(3) %1, i32 0, i32 0 -; CHECK: ret void -entry: - %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_2, i32 0, i32 0 - ret void -} - -; Pointer replacement code should be added. -define internal void @function_1(float %f) { -; CHECK-LABEL: entry: -; CHECK: %0 = load i16, ptr addrspace(3) @lds_used_within_function_1.ptr, align 2 -; CHECK: %1 = getelementptr i8, ptr addrspace(3) null, i16 %0 -; CHECK: %gep = getelementptr inbounds [4 x i32], ptr addrspace(3) %1, i32 0, i32 0 -; CHECK: ret void -entry: - %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_1, i32 0, i32 0 - ret void -} - -; Pointer initialization code shoud be added -define protected amdgpu_kernel void @kernel_calls_function_3_and_1() { -; CHECK-LABEL: entry: -; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK: %1 = icmp eq i32 %0, 0 -; CHECK: br i1 %1, label %2, label %3 -; -; CHECK-LABEL: 2: -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds_used_within_function_3 to i16), ptr addrspace(3) @lds_used_within_function_3.ptr, align 2 -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds_used_within_function_1 to i16), ptr addrspace(3) @lds_used_within_function_1.ptr, align 2 -; CHECK: br label %3 -; -; CHECK-LABEL: 3: -; CHECK: call void @llvm.amdgcn.wave.barrier() -; CHECK: %fptr3 = load ptr, ptr @ptr_to_func3, align 8 -; CHECK: %fptr1 = load ptr, ptr @ptr_to_func1, align 8 -; CHECK: call void %fptr3(i8 1) -; CHECK: call void %fptr1(float 2.000000e+00) -; CHECK: ret void -entry: - %fptr3 = load ptr, ptr @ptr_to_func3, align 8 - %fptr1 = load ptr, ptr @ptr_to_func1, align 8 - call void %fptr3(i8 1) - call void %fptr1(float 2.0) - ret void -} - -; Pointer initialization code shoud be added -define protected amdgpu_kernel void @kernel_calls_function_2_and_3() { -; CHECK-LABEL: entry: -; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK: %1 = icmp eq i32 %0, 0 -; CHECK: br i1 %1, label %2, label %3 -; -; CHECK-LABEL: 2: -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds_used_within_function_3 to i16), ptr addrspace(3) @lds_used_within_function_3.ptr, align 2 -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds_used_within_function_2 to i16), ptr addrspace(3) @lds_used_within_function_2.ptr, align 2 -; CHECK: br label %3 -; -; CHECK-LABEL: 3: -; CHECK: call void @llvm.amdgcn.wave.barrier() -; CHECK: %fptr2 = load ptr, ptr @ptr_to_func2, align 8 -; CHECK: %fptr3 = load ptr, ptr @ptr_to_func3, align 8 -; CHECK: call void %fptr2(i16 3) -; CHECK: call void %fptr3(i8 4) -; CHECK: ret void -entry: - %fptr2 = load void (i16)*, void (i16)** @ptr_to_func2, align 8 - %fptr3 = load void (i8)*, void (i8)** @ptr_to_func3, align 8 - call void %fptr2(i16 3) - call void %fptr3(i8 4) - ret void -} - -; Pointer initialization code shoud be added -define protected amdgpu_kernel void @kernel_calls_function_1_and_2() { -; CHECK-LABEL: entry: -; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK: %1 = icmp eq i32 %0, 0 -; CHECK: br i1 %1, label %2, label %3 -; -; CHECK-LABEL: 2: -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds_used_within_function_2 to i16), ptr addrspace(3) @lds_used_within_function_2.ptr, align 2 -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds_used_within_function_1 to i16), ptr addrspace(3) @lds_used_within_function_1.ptr, align 2 -; CHECK: br label %3 -; -; CHECK-LABEL: 3: -; CHECK: call void @llvm.amdgcn.wave.barrier() -; CHECK: %fptr1 = load ptr, ptr @ptr_to_func1, align 8 -; CHECK: %fptr2 = load ptr, ptr @ptr_to_func2, align 8 -; CHECK: call void %fptr1(float 5.000000e+00) -; CHECK: call void %fptr2(i16 6) -; CHECK: ret void -entry: - %fptr1 = load ptr, ptr @ptr_to_func1, align 8 - %fptr2 = load ptr, ptr @ptr_to_func2, align 8 - call void %fptr1(float 5.0) - call void %fptr2(i16 6) - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-signature-match.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-signature-match.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-signature-match.ll +++ /dev/null @@ -1,91 +0,0 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s - -; DESCRIPTION: -; -; There are three lds globals defined here, and these three lds are used respectively within -; three non-kernel functions. There is one kernel which *indirectly* calls one of the non-kernel -; functions. But since all the three non-kernel functions have same signature, all three -; non-kernel functions are resolved as potential callees for indirect call-site. Hence we land-up -; pointer replacement for three lds globals. -; - -; Original LDS should exist. -; CHECK: @lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4 -; CHECK: @lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4 -; CHECK: @lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4 -@lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4 -@lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4 -@lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4 - -; Function pointers should exist. -; CHECK: @ptr_to_func1 = internal local_unnamed_addr externally_initialized global ptr @function_1, align 8 -; CHECK: @ptr_to_func2 = internal local_unnamed_addr externally_initialized global ptr @function_2, align 8 -; CHECK: @ptr_to_func3 = internal local_unnamed_addr externally_initialized global ptr @function_3, align 8 -@ptr_to_func1 = internal local_unnamed_addr externally_initialized global void (i16)* @function_1, align 8 -@ptr_to_func2 = internal local_unnamed_addr externally_initialized global void (i16)* @function_2, align 8 -@ptr_to_func3 = internal local_unnamed_addr externally_initialized global void (i16)* @function_3, align 8 - -; Pointers should be created. -; CHECK: @lds_used_within_function_1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 -; CHECK: @lds_used_within_function_2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 -; CHECK: @lds_used_within_function_3.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 - -; Pointer replacement code should be added. -define internal void @function_3(i16 %i) { -; CHECK-LABEL: entry: -; CHECK: %0 = load i16, ptr addrspace(3) @lds_used_within_function_3.ptr, align 2 -; CHECK: %1 = getelementptr i8, ptr addrspace(3) null, i16 %0 -; CHECK: %gep = getelementptr inbounds [4 x i32], ptr addrspace(3) %1, i32 0, i32 0 -; CHECK: ret void -entry: - %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_3, i32 0, i32 0 - ret void -} - -; Pointer replacement code should be added. -define internal void @function_2(i16 %i) { -; CHECK-LABEL: entry: -; CHECK: %0 = load i16, ptr addrspace(3) @lds_used_within_function_2.ptr, align 2 -; CHECK: %1 = getelementptr i8, ptr addrspace(3) null, i16 %0 -; CHECK: %gep = getelementptr inbounds [4 x i32], ptr addrspace(3) %1, i32 0, i32 0 -; CHECK: ret void -entry: - %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_2, i32 0, i32 0 - ret void -} - -; Pointer replacement code should be added. -define internal void @function_1(i16 %i) { -; CHECK-LABEL: entry: -; CHECK: %0 = load i16, ptr addrspace(3) @lds_used_within_function_1.ptr, align 2 -; CHECK: %1 = getelementptr i8, ptr addrspace(3) null, i16 %0 -; CHECK: %gep = getelementptr inbounds [4 x i32], ptr addrspace(3) %1, i32 0, i32 0 -; CHECK: ret void -entry: - %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_1, i32 0, i32 0 - ret void -} - -; Pointer initialization code shoud be added -define protected amdgpu_kernel void @kernel_indirectly_calls_function_1() { -; CHECK-LABEL: entry: -; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK: %1 = icmp eq i32 %0, 0 -; CHECK: br i1 %1, label %2, label %3 -; -; CHECK-LABEL: 2: -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds_used_within_function_3 to i16), ptr addrspace(3) @lds_used_within_function_3.ptr, align 2 -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds_used_within_function_2 to i16), ptr addrspace(3) @lds_used_within_function_2.ptr, align 2 -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds_used_within_function_1 to i16), ptr addrspace(3) @lds_used_within_function_1.ptr, align 2 -; CHECK: br label %3 -; -; CHECK-LABEL: 3: -; CHECK: call void @llvm.amdgcn.wave.barrier() -; CHECK: %fptr1 = load ptr, ptr @ptr_to_func1, align 8 -; CHECK: call void %fptr1(i16 6) -; CHECK: ret void -entry: - %fptr1 = load void (i16)*, void (i16)** @ptr_to_func1, align 8 - call void %fptr1(i16 6) - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-multiple-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-multiple-lds.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-multiple-lds.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s - -; DESCRIPTION: -; -; There are three lds globals defined here, and these three lds are used within a single -; non-kernel function, and this non-kernel function is reachable from kernel. Hence pointer -; replacement is required for all three lds globals. -; - -; Original LDS should exist. -; CHECK: @lds1 = internal addrspace(3) global [1 x i32] undef, align 4 -; CHECK: @lds2 = internal addrspace(3) global [2 x i32] undef, align 4 -; CHECK: @lds3 = internal addrspace(3) global [3 x i32] undef, align 4 -@lds1 = internal addrspace(3) global [1 x i32] undef, align 4 -@lds2 = internal addrspace(3) global [2 x i32] undef, align 4 -@lds3 = internal addrspace(3) global [3 x i32] undef, align 4 - -; Pointers should be created. -; CHECK: @lds1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 -; CHECK: @lds2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 -; CHECK: @lds3.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 - -; Pointer replacement code should be added. -define internal void @function() { -; CHECK-LABEL: entry: -; CHECK: %0 = load i16, ptr addrspace(3) @lds3.ptr, align 2 -; CHECK: %1 = getelementptr i8, ptr addrspace(3) null, i16 %0 -; CHECK: %2 = load i16, ptr addrspace(3) @lds2.ptr, align 2 -; CHECK: %3 = getelementptr i8, ptr addrspace(3) null, i16 %2 -; CHECK: %4 = load i16, ptr addrspace(3) @lds1.ptr, align 2 -; CHECK: %5 = getelementptr i8, ptr addrspace(3) null, i16 %4 -; CHECK: %gep1 = getelementptr inbounds [1 x i32], ptr addrspace(3) %5, i32 0, i32 0 -; CHECK: %gep2 = getelementptr inbounds [2 x i32], ptr addrspace(3) %3, i32 0, i32 0 -; CHECK: %gep3 = getelementptr inbounds [3 x i32], ptr addrspace(3) %1, i32 0, i32 0 -; CHECK: ret void -entry: - %gep1 = getelementptr inbounds [1 x i32], ptr addrspace(3) @lds1, i32 0, i32 0 - %gep2 = getelementptr inbounds [2 x i32], ptr addrspace(3) @lds2, i32 0, i32 0 - %gep3 = getelementptr inbounds [3 x i32], ptr addrspace(3) @lds3, i32 0, i32 0 - ret void -} - -; Pointer initialization code shoud be added; -define protected amdgpu_kernel void @kernel() { -; CHECK-LABEL: entry: -; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK: %1 = icmp eq i32 %0, 0 -; CHECK: br i1 %1, label %2, label %3 -; -; CHECK-LABEL: 2: -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds3 to i16), ptr addrspace(3) @lds3.ptr, align 2 -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds2 to i16), ptr addrspace(3) @lds2.ptr, align 2 -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds1 to i16), ptr addrspace(3) @lds1.ptr, align 2 -; CHECK: br label %3 -; -; CHECK-LABEL: 3: -; CHECK: call void @llvm.amdgcn.wave.barrier() -; CHECK: call void @function() -; CHECK: ret void -entry: - call void @function() - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-same-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-same-lds.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-same-lds.ll +++ /dev/null @@ -1,52 +0,0 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s - -; DESCRIPTION: -; -; There is one lds global defined here, and this lds is used within a single non-kernel -; function multiple times, and this non-kernel function is reachable from kernel. Hence -; pointer takes place. But important note is - store-to/load-from pointer should happen -; only once irrespective of number of uses. -; - -; Original LDS should exist. -; CHECK: @lds1 = internal addrspace(3) global [1 x i32] undef, align 4 -@lds1 = internal addrspace(3) global [1 x i32] undef, align 4 - -; Pointers should be created. -; CHECK: @lds1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 - -; Pointer replacement code should be added. -define internal void @function() { -; CHECK-LABEL: entry: -; CHECK: %0 = load i16, ptr addrspace(3) @lds1.ptr, align 2 -; CHECK: %1 = getelementptr i8, ptr addrspace(3) null, i16 %0 -; CHECK: %gep1 = getelementptr inbounds [1 x i32], ptr addrspace(3) %1, i32 0, i32 0 -; CHECK: %gep2 = getelementptr inbounds [1 x i32], ptr addrspace(3) %1, i32 0, i32 0 -; CHECK: %gep3 = getelementptr inbounds [1 x i32], ptr addrspace(3) %1, i32 0, i32 0 -; CHECK: ret void -entry: - %gep1 = getelementptr inbounds [1 x i32], ptr addrspace(3) @lds1, i32 0, i32 0 - %gep2 = getelementptr inbounds [1 x i32], ptr addrspace(3) @lds1, i32 0, i32 0 - %gep3 = getelementptr inbounds [1 x i32], ptr addrspace(3) @lds1, i32 0, i32 0 - ret void -} - -; Pointer initialization code shoud be added; -define protected amdgpu_kernel void @kernel() { -; CHECK-LABEL: entry: -; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK: %1 = icmp eq i32 %0, 0 -; CHECK: br i1 %1, label %2, label %3 -; -; CHECK-LABEL: 2: -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds1 to i16), ptr addrspace(3) @lds1.ptr, align 2 -; CHECK: br label %3 -; -; CHECK-LABEL: 3: -; CHECK: call void @llvm.amdgcn.wave.barrier() -; CHECK: call void @function() -; CHECK: ret void -entry: - call void @function() - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll +++ /dev/null @@ -1,52 +0,0 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s - -; DESCRIPTION: -; -; There is one lds global defined here, and this lds is used within a single non-kernel -; function, as an operand of nested constant expression, and this non-kernel function is -; reachable from kernel. Hence nested constant expression should to be converted into a -; series of instructons and pointer replacement should take place. -; - -; Original LDS should exist. -; CHECK: @used_only_within_func = addrspace(3) global [4 x i32] undef, align 4 -@used_only_within_func = addrspace(3) global [4 x i32] undef, align 4 - -; Pointers should be created. -; CHECK: @used_only_within_func.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 - -; Pointer replacement code should be added. -define void @f0(i32 %x) { -; CHECK-LABEL: entry: -; CHECK: %0 = load i16, ptr addrspace(3) @used_only_within_func.ptr, align 2 -; CHECK: %1 = getelementptr i8, ptr addrspace(3) null, i16 %0 -; CHECK: %2 = addrspacecast ptr addrspace(3) %1 to ptr -; CHECK: %3 = ptrtoint ptr %2 to i64 -; CHECK: %4 = add i64 %3, %3 -; CHECK: %5 = inttoptr i64 %4 to ptr -; CHECK: store i32 %x, ptr %5, align 4 -; CHECK: ret void -entry: - store i32 %x, ptr inttoptr (i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_func to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(3) @used_only_within_func to ptr) to i64)) to ptr), align 4 - ret void -} - -; Pointer initialization code shoud be added -define amdgpu_kernel void @k0() { -; CHECK-LABEL: entry: -; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK: %1 = icmp eq i32 %0, 0 -; CHECK: br i1 %1, label %2, label %3 -; -; CHECK-LABEL: 2: -; CHECK: store i16 ptrtoint (ptr addrspace(3) @used_only_within_func to i16), ptr addrspace(3) @used_only_within_func.ptr, align 2 -; CHECK: br label %3 -; -; CHECK-LABEL: 3: -; CHECK: call void @llvm.amdgcn.wave.barrier() -; CHECK: call void @f0(i32 0) -; CHECK: ret void -entry: - call void @f0(i32 0) - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr2.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr2.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr2.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s - -; DESCRIPTION: -; There is one lds global defined here, and this lds is used within a single non-kernel -; function, as an operand of nested constant expression, and this non-kernel function is -; reachable from kernel. Hence nested constant expression should to be converted into a -; series of instructons and pointer replacement should take place. But, important note -; is - only constant expression operands which uses lds should be converted into -; instructions, other constant expression operands which do not use lds should be left -; untouched. -; - -; Original LDS should exist. -; CHECK: @lds_used_within_function = internal addrspace(3) global [4 x i32] undef, align 4 -@lds_used_within_function = internal addrspace(3) global [4 x i32] undef, align 4 - -; Non-LDS global should exist as it is. -; CHECK: @global_var = internal addrspace(1) global [4 x i32] undef, align 4 -@global_var = internal addrspace(1) global [4 x i32] undef, align 4 - -; Pointer should be created. -; CHECK: @lds_used_within_function.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 - -; Pointer replacement code should be added. -define internal void @function() { -; CHECK-LABEL: entry: -; CHECK: %0 = load i16, ptr addrspace(3) @lds_used_within_function.ptr, align 2 -; CHECK: %1 = getelementptr i8, ptr addrspace(3) null, i16 %0 -; CHECK: %2 = getelementptr inbounds [4 x i32], ptr addrspace(3) %1, i32 0, i32 2 -; CHECK: %3 = addrspacecast ptr addrspace(3) %2 to ptr -; CHECK: %4 = ptrtoint ptr %3 to i32 -; CHECK: %5 = add i32 %4, ptrtoint (ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @global_var, i32 0, i32 2) to i32) -; CHECK: ret void -entry: - %0 = add i32 ptrtoint (ptr addrspacecast (ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @lds_used_within_function, i32 0, i32 2) to ptr) to i32), ptrtoint (ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @global_var, i32 0, i32 2) to i32) - ret void -} - -; Pointer initialization code shoud be added -define protected amdgpu_kernel void @kernel() { -; CHECK-LABEL: entry: -; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK: %1 = icmp eq i32 %0, 0 -; CHECK: br i1 %1, label %2, label %3 -; -; CHECK-LABEL: 2: -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds_used_within_function to i16), ptr addrspace(3) @lds_used_within_function.ptr, align 2 -; CHECK: br label %3 -; -; CHECK-LABEL: 3: -; CHECK: call void @llvm.amdgcn.wave.barrier() -; CHECK: call void @function() -; CHECK: ret void -entry: - call void @function() - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-phi-inst.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-phi-inst.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-phi-inst.ll +++ /dev/null @@ -1,91 +0,0 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s - -; DESCRIPTION: -; -; Replace lds globals used within phi instruction. -; - -; Original LDS should exist. -; CHECK: @lds.1 = addrspace(3) global i32 undef, align 4 -; CHECK: @lds.2 = addrspace(3) global i32 undef, align 4 -@lds.1 = addrspace(3) global i32 undef, align 4 -@lds.2 = addrspace(3) global i32 undef, align 4 - -; Pointers should be created. -; CHECK: @lds.1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 -; CHECK: @lds.2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 - -define void @f0(i32 %arg) { -; CHECK-LABEL: bb: -; CHECK: %0 = load i16, ptr addrspace(3) @lds.2.ptr, align 2 -; CHECK: %1 = getelementptr i8, ptr addrspace(3) null, i16 %0 -; CHECK: %2 = load i16, ptr addrspace(3) @lds.1.ptr, align 2 -; CHECK: %3 = getelementptr i8, ptr addrspace(3) null, i16 %2 -; CHECK: %id = call i32 @llvm.amdgcn.workitem.id.x() -; CHECK: %my.tmp = sub i32 %id, %arg -; CHECK: br label %bb1 -bb: - %id = call i32 @llvm.amdgcn.workitem.id.x() - %my.tmp = sub i32 %id, %arg - br label %bb1 - -; CHECK-LABEL: bb1: -; CHECK: %lsr.iv = phi i32 [ undef, %bb ], [ %my.tmp2, %Flow ] -; CHECK: %4 = icmp ne ptr addrspace(3) inttoptr (i32 4 to ptr addrspace(3)), %3 -; CHECK: %lsr.iv.next = add i32 %lsr.iv, 1 -; CHECK: %cmp0 = icmp slt i32 %lsr.iv.next, 0 -; CHECK: br i1 %cmp0, label %bb4, label %Flow -bb1: - %lsr.iv = phi i32 [ undef, %bb ], [ %my.tmp2, %Flow ] - %lsr.iv.next = add i32 %lsr.iv, 1 - %cmp0 = icmp slt i32 %lsr.iv.next, 0 - br i1 %cmp0, label %bb4, label %Flow - -; CHECK-LABEL: bb4: -; CHECK: %load = load volatile i32, ptr addrspace(1) undef, align 4 -; CHECK: %cmp1 = icmp sge i32 %my.tmp, %load -; CHECK: br label %Flow -bb4: - %load = load volatile i32, ptr addrspace(1) undef, align 4 - %cmp1 = icmp sge i32 %my.tmp, %load - br label %Flow - -; CHECK-LABEL: Flow: -; CHECK: %my.tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] -; CHECK: %my.tmp3 = phi ptr addrspace(3) [ %1, %bb4 ], [ %3, %bb1 ] -; CHECK: %my.tmp4 = phi i1 [ %cmp1, %bb4 ], [ %4, %bb1 ] -; CHECK: br i1 %my.tmp4, label %bb9, label %bb1 -Flow: - %my.tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] - %my.tmp3 = phi ptr addrspace(3) [@lds.2, %bb4 ], [ @lds.1, %bb1 ] - %my.tmp4 = phi i1 [ %cmp1, %bb4 ], [ icmp ne (ptr addrspace(3) inttoptr (i32 4 to ptr addrspace(3)), ptr addrspace(3) @lds.1), %bb1 ] - br i1 %my.tmp4, label %bb9, label %bb1 - -; CHECK-LABEL: bb9: -; CHECK: store volatile i32 7, ptr addrspace(3) undef, align 4 -; CHECK: ret void -bb9: - store volatile i32 7, ptr addrspace(3) undef - ret void -} - -; CHECK-LABEL: @k0 -; CHECK: %1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK: %2 = icmp eq i32 %1, 0 -; CHECK: br i1 %2, label %3, label %4 -; -; CHECK-LABEL: 3: -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds.2 to i16), ptr addrspace(3) @lds.2.ptr, align 2 -; CHECK: store i16 ptrtoint (ptr addrspace(3) @lds.1 to i16), ptr addrspace(3) @lds.1.ptr, align 2 -; CHECK: br label %4 -; -; CHECK-LABEL: 4: -; CHECK: call void @llvm.amdgcn.wave.barrier() -; CHECK: call void @f0(i32 %arg) -; CHECK: ret void -define amdgpu_kernel void @k0(i32 %arg) { - call void @f0(i32 %arg) - ret void -} - -declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -176,7 +176,6 @@ "AMDGPURegisterBankInfo.cpp", "AMDGPUReleaseVGPRs.cpp", "AMDGPURemoveIncompatibleFunctions.cpp", - "AMDGPUReplaceLDSUseWithPointer.cpp", "AMDGPUResourceUsageAnalysis.cpp", "AMDGPURewriteOutArguments.cpp", "AMDGPURewriteUndefForPHI.cpp",