diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -29,7 +29,9 @@ #include "AMDGPU.h" #include "Utils/AMDGPUBaseInfo.h" #include "Utils/AMDGPULDSUtils.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetOperations.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" @@ -48,6 +50,355 @@ namespace { +// ReplaceLDSUseWithPointer replaces all the uses of LDS within non-kernel +// functions by corresponding pointer counter-parts. +// +// The main motivation behind this pointer replacement phase is - to *avoid* +// subsequent LDS lowering phase from directly packing LDS (assume large LDS) +// into a struct type which would otherwise cause allocating huge memory for +// struct instance within every kernel. +// +// Brief sketch of the algorithm is as below: +// +// 1. Collect all the LDS defined in the module which qualify for pointer +// replacement, say it is, LDSGlobals set. +// +// 2. Collect all the reachable callees for each kernel defined in the module, +// say it is, KernelToCallees map. +// +// 3. FOR (each global GV from LDSGlobals set) DO +// LDSUsedNonKernels = Collect all non-kernel functions which use GV. +// FOR (each kernel K in KernelToCallees map) DO +// ReachableCallees = KernelToCallees[K] +// ReachableAndLDSUsedCallees = +// SetIntersect(LDSUsedNonKernels, ReachableCallees) +// IF (ReachableAndLDSUsedCallees is not empty) THEN +// Pointer = Create a pointer to point-to GV if not created. +// Initialize Pointer to point-to GV within kernel K. +// ENDIF +// ENDFOR +// Replace all uses of GV within non kernel functions by Pointer. 
+// ENDFOR +// +// LLVM IR example: +// +// Input IR: +// +// @lds = internal addrspace(3) global [4 x i32] undef, align 16 +// +// define internal void @f0() { +// entry: +// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds, +// i32 0, i32 0 +// ret void +// } +// +// define protected amdgpu_kernel void @k0() { +// entry: +// call void @f0() +// ret void +// } +// +// Output IR: +// +// @lds = internal addrspace(3) global [4 x i32] undef, align 16 +// @lds.pointer = internal unnamed_addr addrspace(3) global i16 undef, align 2 +// +// define internal void @f0() { +// entry: +// %0 = load i16, i16 addrspace(3)* @lds.pointer, align 2 +// %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +// %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, +// i32 0, i32 0 +// ret void +// } +// +// define protected amdgpu_kernel void @k0() { +// entry: +// store i16 ptrtoint ([4 x i32] addrspace(3)* @lds to i16), +// i16 addrspace(3)* @lds.pointer, align 2 +// call void @f0() +// ret void +// } +// +class ReplaceLDSUseWithPointer { + Module &M; + LLVMContext &Ctx; + const DataLayout &DL; + Constant *LDSMemBaseAddr; + + SmallPtrSet UsedList; + std::vector LDSGlobals; + DenseMap LDSToPointer; + DenseMap> LDSToNonKernels; + DenseMap> KernelToCallees; + DenseMap> KernelToLDSPointers; + DenseMap> + FunctionToLDSToReplaceInst; + + // Collect LDS which requires their uses to be replaced by pointer. + bool collectLDSRequiringPointerReplace() { + // Collect LDS which requires module lowering. + LDSGlobals = AMDGPU::findVariablesToLower(M, UsedList); + + // Remove LDS which don't qualify for replacement. 
+ LDSGlobals.erase(std::remove_if(LDSGlobals.begin(), LDSGlobals.end(), + [&](GlobalVariable *GV) { + return shouldIgnorePointerReplacement(GV); + }), + LDSGlobals.end()); + + return !LDSGlobals.empty(); + } + + // Returns true if an LDS global does not require its non-kernel function uses + // to be replaced by pointer. + bool shouldIgnorePointerReplacement(GlobalVariable *GV) { + // LDS whose size is very small and doesn't exceed pointer size is not worth + // replacing. + if (DL.getTypeAllocSize(GV->getValueType()) <= 2) + return true; + + // LDS which is not used from non-kernel function scope or it is used from + // global scope does not qualify for replacement. + LDSToNonKernels[GV] = AMDGPU::collectNonKernelAccessorsOfLDS(GV); + if (LDSToNonKernels[GV].empty()) + return true; + + // FIXME: Any other scenarios which disqualify LDS from replacement? + + return false; + } + + // Insert new global LDS pointer which points to LDS. + GlobalVariable *createLDSPointer(GlobalVariable *GV) { + // LDS pointer which points to LDS is already created? return it. + auto Entry = LDSToPointer.insert(std::make_pair(GV, nullptr)); + if (!Entry.second) + return Entry.first->second; + + // We need to create new LDS pointer which points to LDS. + // + // Each CU owns 64K of LDS memory, so address ranges from 0 to 2^16 - 1. + // Hence 16 bit pointer is enough to hold the LDS address. + auto *I16Ty = Type::getInt16Ty(Ctx); + GlobalVariable *LDSPointer = new GlobalVariable( + M, I16Ty, false, GlobalValue::InternalLinkage, UndefValue::get(I16Ty), + GV->getName() + Twine(".pointer"), nullptr, + GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); + + LDSPointer->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + LDSPointer->setAlignment(AMDGPU::getAlign(DL, LDSPointer)); + + // Mark that an associated LDS pointer is created for LDS. + LDSToPointer[GV] = LDSPointer; + + return LDSPointer; + } + + // Within given kernel, initialize given LDS pointer to point to given LDS. 
+ void initializeLDSPointer(Function *K, GlobalVariable *GV, + GlobalVariable *LDSPointer) { + // If LDS pointer is already initialized within K, then nothing to do. + auto Entry = KernelToLDSPointers.insert( + std::make_pair(K, SmallPtrSet())); + if (!Entry.second) + if (Entry.first->second.contains(LDSPointer)) + return; + + // Insert instructions at EI which initialize LDS pointer to point-to LDS + // within kernel K. + // + // That is, convert pointer type of GV to i16, and then store this converted + // i16 value within LDSPointer which is of type i16*. + auto *EI = &(*(K->getEntryBlock().getFirstInsertionPt())); + IRBuilder<> Builder(EI); + Builder.CreateStore(Builder.CreatePtrToInt(GV, Type::getInt16Ty(Ctx)), + LDSPointer); + + // Mark that LDS pointer is initialized within kernel K. + KernelToLDSPointers[K].insert(LDSPointer); + } + + // We have created an LDS pointer for LDS, and initialized it to point-to LDS + // within all relevant kernels. Now replace all the uses of LDS within + // non-kernel functions by LDS pointer. + void replaceLDSUseByPointer(GlobalVariable *GV, GlobalVariable *LDSPointer) { + SmallVector LDSUsers(GV->users()); + for (auto *U : LDSUsers) { + // When `U` is a constant expression, it is possible that same constant + // expression exists within multiple instructions, and within multiple + // non-kernel functions. Collect all those non-kernel functions and all + // those instructions within which `U` exists. + auto FunctionToInsts = AMDGPU::getNonKernelFunctionToInstsMap(U); + + for (auto FI = FunctionToInsts.begin(), FE = FunctionToInsts.end(); + FI != FE; ++FI) { + Function *F = FI->first; + auto InstSet = FI->second; + for (auto *I : InstSet) { + // If `U` is a constant expression, then we need to break the + // associated instruction into a set of separate instructions by + // converting constant expressions into instructions. 
+ SmallPtrSet UserInsts; + + if (U == I) { + // `U` is an instruction, conversion from constant expression to + // set of instructions is *not* required. + UserInsts.insert(I); + } else { + // `U` is a constant expression, convert it into corresponding set + // of instructions. + auto *CE = cast(U); + AMDGPU::convertConstExprsToInstructions(I, CE, UserInsts); + } + + // Go through all the user instructions, if LDS exists within them as an + // operand, then replace it by the replacement instruction. + for (auto *II : UserInsts) { + auto *ReplaceInst = getReplacementInst(F, GV, LDSPointer); + AMDGPU::updateUserOperand(II, GV, ReplaceInst); + } + } + } + } + } + + // Create a set of replacement instructions which together replace LDS within + // non-kernel function F by accessing LDS indirectly using LDS pointer. + Value *getReplacementInst(Function *F, GlobalVariable *GV, + GlobalVariable *LDSPointer) { + // If the instruction which replaces LDS within F is already created, then + // return it. + auto Entry = FunctionToLDSToReplaceInst.insert( + std::make_pair(F, DenseMap())); + if (!Entry.second) { + auto Entry2 = Entry.first->second.insert(std::make_pair(GV, nullptr)); + if (!Entry2.second) + return Entry2.first->second; + } + + // Get the instruction insertion point within the beginning of the entry + // block of current non-kernel function. + auto *EI = &(*(F->getEntryBlock().getFirstInsertionPt())); + IRBuilder<> Builder(EI); + + // Insert required set of instructions which replace LDS within F. + auto *V = Builder.CreateBitCast( + Builder.CreateGEP( + LDSMemBaseAddr, + Builder.CreateLoad(LDSPointer->getValueType(), LDSPointer)), + GV->getType()); + + // Mark that the replacement instruction which replaces LDS within F is + // created. 
+ FunctionToLDSToReplaceInst[F][GV] = V; + + return V; + } + +public: + ReplaceLDSUseWithPointer(Module &M, SmallPtrSet &UList) + : M(M), Ctx(M.getContext()), DL(M.getDataLayout()), UsedList(UList) { + LDSMemBaseAddr = Constant::getIntegerValue( + PointerType::get(Type::getInt8Ty(M.getContext()), + AMDGPUAS::LOCAL_ADDRESS), + APInt(32, 0)); + } + + // Entry-point function which interface ReplaceLDSUseWithPointer with outside + // of the class. + bool replaceLDSUse(); + +private: + // For a given LDS from collected LDS globals set, replace its non-kernel + // function scope use by pointer. + bool replaceLDSUse(GlobalVariable *GV); +}; + +// For given LDS from collected LDS globals set, replace its non-kernel function +// scope use by pointer. +bool ReplaceLDSUseWithPointer::replaceLDSUse(GlobalVariable *GV) { + // Holds all those non-kernel functions within which LDS is being accessed. + SmallPtrSet LDSAccessors = LDSToNonKernels[GV]; + + // The LDS pointer which points to LDS and replaces all the uses of LDS. + GlobalVariable *LDSPointer = nullptr; + + // Traverse through each kernel K, check and if required, initialize the + // LDS pointer to point to LDS within K. + for (auto KI = KernelToCallees.begin(), KE = KernelToCallees.end(); KI != KE; + ++KI) { + Function *K = KI->first; + SmallPtrSet Callees = KI->second; + + // Compute reachable and LDS used callees for kernel K. + set_intersect(Callees, LDSAccessors); + + // None of the LDS accessing non-kernel functions are reachable from + // kernel K. Hence, no need to initialize LDS pointer within kernel K. + if (Callees.empty()) + continue; + + // We have found reachable and LDS used callees for kernel K, and we need to + // initialize LDS pointer within kernel K, and we need to replace LDS use + // within those callees by LDS pointer. + // + // But, first check if LDS pointer is already created, if not create one. + LDSPointer = createLDSPointer(GV); + + // Initialize LDS pointer to point to LDS within kernel K. 
+ initializeLDSPointer(K, GV, LDSPointer); + } + + // We have not found reachable and LDS used callees for any of the kernels, + // and hence we have not created LDS pointer. + if (!LDSPointer) + return false; + + // We have created an LDS pointer for LDS, and initialized it to point-to LDS + // within all relevant kernels. Now replace all the uses of LDS within + // non-kernel functions by LDS pointer. + replaceLDSUseByPointer(GV, LDSPointer); + + return true; +} + +// Entry-point function which interfaces ReplaceLDSUseWithPointer with outside of +// the class. +bool ReplaceLDSUseWithPointer::replaceLDSUse() { + // Collect LDS which requires their uses to be replaced by pointer. + if (!collectLDSRequiringPointerReplace()) + return false; + + // Collect reachable callee set for each kernel defined in the module. + AMDGPU::collectReachableCallees(M, KernelToCallees); + + if (KernelToCallees.empty()) { + // Either module does not have any kernel definitions, or none of the kernels + // has a call to non-kernel functions, or we could not resolve any of the + // call sites to proper non-kernel functions, because of the situations like + // inline asm calls. Nothing to replace. + return false; + } + + // For every LDS from collected LDS globals set, replace its non-kernel + // function scope use by pointer. + bool Changed = false; + for (auto *GV : LDSGlobals) + Changed |= replaceLDSUse(GV); + + return Changed; +} + +// Interface function between AMDGPULowerModuleLDS and ReplaceLDSUseWithPointer. 
+static bool replaceLDSUseWithPointer(Module &M, + SmallPtrSet &UsedList) { + ReplaceLDSUseWithPointer LDSUseReplacer{M, UsedList}; + return LDSUseReplacer.replaceLDSUse(); +} + class AMDGPULowerModuleLDS : public ModulePass { static void removeFromUsedList(Module &M, StringRef Name, @@ -144,10 +495,19 @@ } bool runOnModule(Module &M) override { + bool Changed = false; + UsedList = AMDGPU::getUsedList(M); - bool Changed = processUsedLDS(M); + // As a first step, before proceeding with LDS lowering process, replace + // non-kernel function use of LDS by pointers. This will help reduce the + // memory overhead being introduced during lowering process. + Changed = replaceLDSUseWithPointer(M, UsedList); + // As a second step, perform module LDS lowering. + Changed |= processUsedLDS(M); + + // As a third step, perform kernel LDS lowering. for (Function &F : M.functions()) { if (!AMDGPU::isKernelCC(&F)) continue; @@ -155,6 +515,7 @@ } UsedList.clear(); + return Changed; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h @@ -14,11 +14,42 @@ #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULDSUTILS_H #include "AMDGPU.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/IR/Constants.h" namespace llvm { namespace AMDGPU { +// Collect reachable callees for each kernel defined in the module. +void collectReachableCallees( + Module &M, + DenseMap> &KernelToCallees); + +// For the given LDS, visit its user list and find all those non-kernel +// functions within which the LDS is being used. +SmallPtrSet collectNonKernelAccessorsOfLDS(GlobalVariable *GV); + +// Collect all those non-kernel functions and all those instructions within +// which the given user U exists. 
+DenseMap> +getNonKernelFunctionToInstsMap(User *U); + +// Given an instruction I which uses the given constant expression CE either directly +// or indirectly, return all such constant expression operands of I. +SmallPtrSet getCEOperands(Instruction *I, ConstantExpr *CE); + +// The instruction I contains const expressions(possibly nested) as its +// operands, convert those const expressions into corresponding instructions. +void convertConstExprsToInstructions(Instruction *I, + SmallPtrSetImpl &Operands, + SmallPtrSetImpl &UserInsts); +void convertConstExprsToInstructions(Instruction *I, ConstantExpr *CE, + SmallPtrSetImpl &UserInsts); + +// Within User U replace the use(s) of old value OldV by new value NewV. +void updateUserOperand(User *U, Value *OldV, Value *NewV); + bool isKernelCC(const Function *Func); Align getAlign(DataLayout const &DL, const GlobalVariable *GV); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp @@ -12,7 +12,8 @@ #include "AMDGPULDSUtils.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/IR/Constants.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/Analysis/CallGraph.h" using namespace llvm; @@ -20,6 +21,266 @@ namespace AMDGPU { +// A helper class for collecting all reachable callees for each kernel defined +// within the module. +class CollectReachableCallees { + Module &M; + CallGraph CG; + SmallPtrSet AddressTakenFunctions; + + // Collect all address taken functions within the module. 
+ void collectAddressTakenFunctions() { + auto *ECNode = CG.getExternalCallingNode(); + + for (auto GI = ECNode->begin(), GE = ECNode->end(); GI != GE; ++GI) { + auto *CGN = GI->second; + auto *F = CGN->getFunction(); + if (!F || F->isDeclaration() || AMDGPU::isKernelCC(F)) + continue; + AddressTakenFunctions.insert(CGN); + } + } + + // Collect all reachable call graph nodes from given call graph node. + SmallPtrSet collectCGNodes(CallGraphNode *CGN) { + SmallPtrSet CGNodes; + + for (scc_iterator I = scc_begin(CGN); !I.isAtEnd(); ++I) { + const std::vector &SCC = *I; + assert(!SCC.empty() && "SCC with no functions?"); + for (auto *CGNode : SCC) + CGNodes.insert(CGNode); + } + + return CGNodes; + } + + // For given kernel, collect all its reachable non-kernel functions. + SmallPtrSet collectReachableCallees(Function *K) { + SmallPtrSet ReachableCallees; + + // Call graph node which represents this kernel. + auto *KCGN = CG[K]; + + // Collect all reachable call graph nodes from the node representing this + // kernel. + SmallPtrSet CGNodes = collectCGNodes(KCGN); + + // Go through collected reachable nodes, visit all their call sites, if the + // call site is direct, add corresponding callee to reachable callee set, if + // it is indirect, resolve the indirect call site to potential reachable + // callees, add them to reachable callee set, and repeat the process for the + // newly added potential callee nodes. + // + // FIXME: Need to handle bit-casted function pointers. 
+ // + SmallVector CGNStack(CGNodes.begin(), CGNodes.end()); + SmallPtrSet VisitedCGNodes; + while (!CGNStack.empty()) { + auto *CGN = CGNStack.pop_back_val(); + + if (!VisitedCGNodes.insert(CGN).second) + continue; + + for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) { + auto *RCB = cast(GI->first.getValue()); + auto *RCGN = GI->second; + + if (auto *DCallee = RCGN->getFunction()) { + ReachableCallees.insert(DCallee); + } else if (RCB->isIndirectCall()) { + auto *RCBFTy = RCB->getFunctionType(); + for (auto *ACGN : AddressTakenFunctions) { + auto *ACallee = ACGN->getFunction(); + if (ACallee->getFunctionType() == RCBFTy) { + ReachableCallees.insert(ACallee); + SmallPtrSet IGCNNodes = collectCGNodes(ACGN); + for (auto *IGCN : IGCNNodes) + CGNStack.push_back(IGCN); + } + } + } + } + } + + return ReachableCallees; + } + +public: + explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) { + // Collect address taken functions. + collectAddressTakenFunctions(); + } + + void collectReachableCallees( + DenseMap> &KernelToCallees) { + // Collect reachable callee set for each kernel defined in the module. + for (Function &F : M.functions()) { + if (!AMDGPU::isKernelCC(&F)) + continue; + Function *K = &F; + KernelToCallees[K] = collectReachableCallees(K); + } + } +}; + +// Collect reachable callees for each kernel defined in the module. +void collectReachableCallees( + Module &M, + DenseMap> &KernelToCallees) { + CollectReachableCallees CRC{M}; + CRC.collectReachableCallees(KernelToCallees); +} + +// For the given LDS, visit its user list and find all those non-kernel +// functions within which the LDS is being used. +SmallPtrSet collectNonKernelAccessorsOfLDS(GlobalVariable *GV) { + SmallPtrSet LDSAccessors; + SmallVector UserStack(GV->users()); + SmallPtrSet VisitedUsers; + + while (!UserStack.empty()) { + auto *U = UserStack.pop_back_val(); + + // `U` is already visited? continue to next one. 
+ if (!VisitedUsers.insert(U).second) + continue; + + // `U` is a global variable which is initialized with LDS. Ignore LDS. + if (isa(U)) + return SmallPtrSet(); + + // Recursively explore constant users. + if (isa(U)) { + append_range(UserStack, U->users()); + continue; + } + + // `U` should be an instruction, if it belongs to a non-kernel function F, + // then collect F. + Function *F = cast(U)->getFunction(); + if (!AMDGPU::isKernelCC(F)) + LDSAccessors.insert(F); + } + + return LDSAccessors; +} + +// Collect all those non-kernel functions and all those instructions within +// which the given user U exist. +DenseMap> +getNonKernelFunctionToInstsMap(User *U) { + DenseMap> FunctionToInsts; + SmallVector UserStack; + SmallPtrSet VisitedUsers; + + UserStack.push_back(U); + + while (!UserStack.empty()) { + auto *UU = UserStack.pop_back_val(); + + if (!VisitedUsers.insert(UU).second) + continue; + + if (isa(UU)) + continue; + + if (isa(UU)) { + append_range(UserStack, UU->users()); + continue; + } + + auto *I = cast(UU); + Function *F = I->getFunction(); + if (!AMDGPU::isKernelCC(F)) { + FunctionToInsts.insert( + std::make_pair(F, SmallPtrSet())); + FunctionToInsts[F].insert(I); + } + } + + return FunctionToInsts; +} + +// Given an instruction I which use given constant expression CE either directly +// or indirectly, return all such constant expression operands of I. 
+SmallPtrSet getCEOperands(Instruction *I, ConstantExpr *CE) { + SmallPtrSet CEOperands; + + for (Use &UU : I->operands()) { + auto *CE2 = dyn_cast(UU.get()); + if (!CE2) + continue; + + if (CE2 != CE) { + SmallVector Stack; + Stack.push_back(CE2); + + while (!Stack.empty()) { + Value *V = Stack.pop_back_val(); + if (auto *CE3 = dyn_cast(V)) { + if (CE3 == CE) { + CEOperands.insert(UU.get()); + break; + } else { + for (Use &UU : CE3->operands()) + Stack.push_back(UU.get()); + } + } + } + } else + CEOperands.insert(UU.get()); + } + + return CEOperands; +} + +// Convert const expression operands of I which are collected in Operands to +// corresponding instructions. +void convertConstExprsToInstructions( + Instruction *I, SmallPtrSetImpl &Operands, + SmallPtrSetImpl &UserInsts) { + for (auto *V : Operands) { + auto *CE = dyn_cast(V); + if (!CE) + continue; + + auto *NI = CE->getAsInstruction(); + NI->insertBefore(I); + updateUserOperand(I, CE, NI); + CE->removeDeadConstantUsers(); + UserInsts.insert(NI); + + SmallPtrSet Operands2; + for (Use &UU : CE->operands()) + Operands2.insert(UU.get()); + convertConstExprsToInstructions(NI, Operands2, UserInsts); + } +} + +// The instruction I contains const expressions(possibly nested) as its +// operands, collect those const expressions operands and convert them to +// corresponding instructions. +void convertConstExprsToInstructions( + Instruction *I, ConstantExpr *CE, + SmallPtrSetImpl &UserInsts) { + // Get all operands of I which use CE either directly or indirectly. + SmallPtrSet CEOperands = AMDGPU::getCEOperands(I, CE); + + // Convert const expressions operands of I to instructions. + convertConstExprsToInstructions(I, CEOperands, UserInsts); +} + +// Within User U replace the use(s) of old value OldV by new value NewV. 
+void updateUserOperand(User *U, Value *OldV, Value *NewV) { + unsigned Ind = 0; + for (Use &UU : U->operands()) { + if (UU.get() == OldV) + U->setOperand(Ind, NewV); + ++Ind; + } +} + bool isKernelCC(const Function *Func) { return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv()); } diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll @@ -3,7 +3,8 @@ ; Padding to meet alignment, so references to @var1 replaced with gep ptr, 0, 2 ; No i64 as addrspace(3) types with initializers are ignored. Likewise no addrspace(4). -; CHECK: %llvm.amdgcn.module.lds.t = type { float, [4 x i8], i32 } +; CHECK: %llvm.amdgcn.module.lds.t = type { i32, i16 } +; CHECK: %llvm.amdgcn.kernel.kern_call.lds.t = type { float } ; Variables removed by pass ; CHECK-NOT: @var0 @@ -12,22 +13,32 @@ @var0 = addrspace(3) global float undef, align 8 @var1 = addrspace(3) global i32 undef, align 8 +; Pointer created for @var0 should have lowered. +; CHECK-NOT: @var0.pointer + +; CHECK: @ptr = addrspace(1) global i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 4 @ptr = addrspace(1) global i32 addrspace(3)* @var1, align 4 ; A variable that is unchanged by pass ; CHECK: @with_init = addrspace(3) global i64 0 @with_init = addrspace(3) global i64 0 -; Instance of new type, aligned to max of element alignment +; Module and kernel LDS instances. 
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8 +; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata" +; CHECK: @llvm.amdgcn.kernel.kern_call.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kern_call.lds.t undef, align 8 ; Use in func rewritten to access struct at address zero ; CHECK-LABEL: @func() -; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 1.0 -; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 4 -; CHECK: %val1 = add i32 %val0, 4 -; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 4 -; CHECK: %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic +; CHECK: %1 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2 +; CHECK: %2 = getelementptr i8, i8 addrspace(3)* null, i16 %1 +; CHECK: %3 = bitcast i8 addrspace(3)* %2 to float addrspace(3)* +; CHECK: %dec = atomicrmw fsub float addrspace(3)* %3, float 1.000000e+00 monotonic, align 4 +; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 4 +; CHECK: %val1 = add i32 %val0, 4 +; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, 
i32 0, i32 0), align 4 +; CHECK: %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic, align 8 +; CHECK: ret void define void @func() { %dec = atomicrmw fsub float addrspace(3)* @var0, float 1.0 monotonic %val0 = load i32, i32 addrspace(3)* @var1, align 4 @@ -39,9 +50,11 @@ ; This kernel calls a function that uses LDS so needs the block ; CHECK-LABEL: @kern_call() -; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] -; CHECK: call void @func() -; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 2.000000e+00 monotonic, align 4 +; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: store i16 ptrtoint (%llvm.amdgcn.kernel.kern_call.lds.t addrspace(3)* @llvm.amdgcn.kernel.kern_call.lds to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2 +; CHECK: call void @func() +; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.kern_call.lds.t, %llvm.amdgcn.kernel.kern_call.lds.t addrspace(3)* @llvm.amdgcn.kernel.kern_call.lds, i32 0, i32 0), float 2.000000e+00 monotonic, align 4 +; CHECK: ret void define amdgpu_kernel void @kern_call() { call void @func() %dec = atomicrmw fsub float addrspace(3)* @var0, float 2.0 monotonic diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-diamond-shape.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-diamond-shape.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-diamond-shape.ll @@ -0,0 +1,80 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < 
%s | FileCheck %s + +; @lds_used_within_func is used within @func_uses_lds which is reachable from kernel +; @kernel_reaches_lds, hence pointer replacement takes place for @lds_used_within_func. +; CHECK: %llvm.amdgcn.module.lds.t = type { i16 } +; CHECK: %llvm.amdgcn.kernel.kernel_reaches_lds.lds.t = type { [4 x i32] } +; CHECK: %llvm.amdgcn.kernel.kernel_does_not_reach_lds.lds.t = type { %llvm.amdgcn.kernel.kernel_reaches_lds.lds.t } + +; Original LDS should not exist +; CHECK-NOT: @lds_used_within_func +@lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4 + +; Pointer should not exist +; CHECK-NOT: @lds_used_within_func.pointer + +; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 2 +; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata" +; CHECK: @llvm.amdgcn.kernel.kernel_reaches_lds.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kernel_reaches_lds.lds.t undef, align 4 +; CHECK: @llvm.amdgcn.kernel.kernel_does_not_reach_lds.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kernel_does_not_reach_lds.lds.t undef, align 4 + +define internal void @func_uses_lds() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_func, i32 0, i32 0 + ret void +} + +define internal void @func_does_not_use_lds_3() { +; CHECK-LABEL: entry: +; 
CHECK: call void @func_uses_lds() +; CHECK: ret void +entry: + call void @func_uses_lds() + ret void +} + +define internal void @func_does_not_use_lds_2() { +; CHECK-LABEL: entry: +; CHECK: call void @func_uses_lds() +; CHECK: ret void +entry: + call void @func_uses_lds() + ret void +} + +define internal void @func_does_not_use_lds_1() { +; CHECK-LABEL: entry: +; CHECK: call void @func_does_not_use_lds_2() +; CHECK: call void @func_does_not_use_lds_3() +; CHECK: ret void +entry: + call void @func_does_not_use_lds_2() + call void @func_does_not_use_lds_3() + ret void +} + +define protected amdgpu_kernel void @kernel_reaches_lds() { +; CHECK-LABEL: entry: +; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: store i16 ptrtoint (%llvm.amdgcn.kernel.kernel_reaches_lds.lds.t addrspace(3)* @llvm.amdgcn.kernel.kernel_reaches_lds.lds to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2 +; CHECK: call void @func_does_not_use_lds_1() +; CHECK: ret void +entry: + call void @func_does_not_use_lds_1() + ret void +} + +define protected amdgpu_kernel void @kernel_does_not_reach_lds() { +; CHECK-LABEL: entry: +; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: ret void +entry: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-selected_functions.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-selected_functions.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-selected_functions.ll @@ -0,0 +1,96 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s + +; All the three lds require pointer replacement since functions which access them 
are reachable. +; CHECK: %llvm.amdgcn.module.lds.t = type { i16, i16, i16 } + +; FIXME: Original LDS should have been lowered by kernel lowering phase, but it is not. +@lds_used_within_function_1 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_used_within_function_2 = internal addrspace(3) global [2 x i32] undef, align 4 +@lds_used_within_function_3 = internal addrspace(3) global [3 x i32] undef, align 4 + +; Pointers should have been lowered. +; CHECK-NOT: @lds_used_within_function_1.pointer +; CHECK-NOT: @lds_used_within_function_2.pointer +; CHECK-NOT: @lds_used_within_function_3.pointer + +; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 2 +; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata" + +define internal void @function_3() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [3 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [3 x i32], [3 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [3 x i32], [3 x i32] addrspace(3)* @lds_used_within_function_3, i32 0, i32 0 + ret void +} + +define internal void @function_2() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [2 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [2 x i32], [2 x i32] 
addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @lds_used_within_function_2, i32 0, i32 0 + ret void +} + +define internal void @function_1() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [1 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_used_within_function_1, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel_calls_function_3_and_1() { +; CHECK-LABEL: entry: +; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: store i16 ptrtoint ([3 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 2 +; CHECK: store i16 ptrtoint ([1 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2 +; CHECK: call void @function_3() +; CHECK: call void @function_1() +; CHECK: ret void +entry: + call void @function_3() + call void @function_1() + ret void +} + +define protected amdgpu_kernel void @kernel_calls_function_2_and_3() { +; CHECK-LABEL: entry: +; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: store i16 ptrtoint ([3 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 
addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 2 +; CHECK: store i16 ptrtoint ([2 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2 +; CHECK: call void @function_2() +; CHECK: call void @function_3() +; CHECK: ret void +entry: + call void @function_2() + call void @function_3() + ret void +} + +define protected amdgpu_kernel void @kernel_calls_function_1_and_2() { +; CHECK-LABEL: entry: +; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: store i16 ptrtoint ([2 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2 +; CHECK: store i16 ptrtoint ([1 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2 +; CHECK: call void @function_1() +; CHECK: call void @function_2() +; CHECK: ret void +entry: + call void @function_1() + call void @function_2() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll @@ -0,0 +1,39 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s + +; DESCRIPTION ; +; +; LDS global @globaly_used_lds is used within non-kernel function @func, and @func is 
reachable +; from kernel @kern, but since @globaly_used_lds is also used in global scope as an initializer +; @global_var replacement does not take place. Instead @globaly_used_lds is directly lowered. +; + +; CHECK-NOT: %llvm.amdgcn.module.lds.t = type { i16 } +; CHECK: %llvm.amdgcn.module.lds.t = type { [4 x i32] } + +; CHECK-NOT: @globaly_used_lds +; CHECK: @global_var = addrspace(1) global float* addrspacecast (float addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to float addrspace(3)*) to float*), align 8 +@globaly_used_lds = internal addrspace(3) global [4 x i32] undef, align 4 +@global_var = addrspace(1) global float* addrspacecast ([4 x i32] addrspace(3)* @globaly_used_lds to float*), align 8 + +; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4 +; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata" + +define void @func() { +; CHECK-LABEL: entry: +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @globaly_used_lds, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel() { +; CHECK-LABEL: entry: +; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: call void @func() +; CHECK: ret void +entry: + call void @func() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll new file mode 100644 --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll @@ -0,0 +1,34 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s + +; We do not know what to do with inline asm call, we ignore it, hence @used_only_within_func +; is directly lowered. + +; CHECK-NOT: %llvm.amdgcn.module.lds.t = type { i16 } +; CHECK: %llvm.amdgcn.module.lds.t = type { [4 x i32] } + +; CHECK-NOT: @used_only_within_func +@used_only_within_func = addrspace(3) global [4 x i32] undef, align 4 + +; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4 +; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata" + +define void @f0(i32 %x) { +; CHECK-LABEL: entry: +; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, +; CHECK: i32 0, i32 0, i32 0) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0, i32 0) to i32*) to i64)) to i32*), align 4 +; CHECK: ret void +entry: + store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 + ret void +} + +define amdgpu_kernel void @k0() { +; CHECK-LABEL: entry: +; CHECK: call void @llvm.donothing() [ 
"ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: %0 = call i32 asm "s_mov_b32 $0, 0", "=s"() +; CHECK: ret void +entry: + call i32 asm "s_mov_b32 $0, 0", "=s"() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll @@ -0,0 +1,29 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s + +; DESCRIPTION ; +; +; LDS global @used_only_within_kern is used only within kernel @k0, hence this variable +; is untouched by both pointer replacement and module lowering phase, but kernel lowering +; phase lowers it. +; + +; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [4 x i32] } + +; CHECK-NOT: @used_only_within_kern +@used_only_within_kern = addrspace(3) global [4 x i32] undef, align 4 + +; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 4 + +define amdgpu_kernel void @k0() { +; CHECK-LABEL: entry: +; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0, i32 0) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0, i32 0) to i32*) to i64)) to i32*), align 4 +; CHECK: %mul = mul i32 %ld, 2 +; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* 
@llvm.amdgcn.kernel.k0.lds, i32 0, i32 0, i32 0) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0, i32 0) to i32*) to i64)) to i32*), align 4 +; CHECK: ret void +entry: + %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 + %mul = mul i32 %ld, 2 + store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll @@ -0,0 +1,38 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s + +; DESCRIPTION ; +; +; LDS global @not-reachable-lds is used within non-kernel function @func, but @func is *not* +; reachable from kernel @kern, hence pointer replacement does not take place. Instead +; @not-reachable-lds is directly lowered. +; +; FIXME: Strictly speaking module lds lowering phase should not lower @not-reachable-lds, but +; at the moment, we are taking a conservative decision to lower all non-kernel used LDS. 
On the +; other hand we may not exactly know what to do for these cases. + +; CHECK-NOT: %llvm.amdgcn.module.lds.t = type { i16 } +; CHECK: %llvm.amdgcn.module.lds.t = type { [4 x i32] } + +; CHECK-NOT: @not-reachable-lds +@not-reachable-lds = internal addrspace(3) global [4 x i32] undef, align 4 + +; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4 +; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata" + +define internal void @func() { +; CHECK-LABEL: entry: +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @not-reachable-lds, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel() { +; CHECK-LABEL: entry: +; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: ret void +entry: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll @@ -0,0 +1,37 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s + +; DESCRIPTION ; +; +; LDS global @small_lds is used within non-kernel function @func, and @func is reachable +; from kernel @kern, but since @small_lds is too small for pointer replacement, pointer +; replacement does not take place. Instead @small_lds is directly lowered. 
+; + +; CHECK-NOT: %llvm.amdgcn.module.lds.t = type { i16 } +; CHECK: %llvm.amdgcn.module.lds.t = type { i8 } + +; CHECK-NOT: @small_lds +@small_lds = addrspace(3) global i8 undef, align 1 + +; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 1 +; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8*)], section "llvm.metadata" + +define void @func() { +; CHECK-LABEL: entry: +; CHECK: store i8 1, i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 1 +; CHECK: ret void +entry: + store i8 1, i8 addrspace(3)* @small_lds, align 1 + ret void +} + +define amdgpu_kernel void @kern() { +; CHECK-LABEL: entry: +; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: call void @func() +; CHECK: ret void +entry: + call void @func() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-diamond-shape.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-diamond-shape.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-diamond-shape.ll @@ -0,0 +1,88 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s + +; @lds_used_within_func is used within @func_uses_lds which is *indirectly* reachable from +; kernel @kernel_reaches_lds, hence pointer replacement takes place for @lds_used_within_func. 
+; CHECK: %llvm.amdgcn.module.lds.t = type { i16 } +; CHECK: %llvm.amdgcn.kernel.kernel_reaches_lds.lds.t = type { [4 x i32] } +; CHECK: %llvm.amdgcn.kernel.kernel_does_not_reach_lds.lds.t = type { %llvm.amdgcn.kernel.kernel_reaches_lds.lds.t } + +; Original LDS should not exist. +; CHECK-NOT: @lds_used_within_func +@lds_used_within_func = internal addrspace(3) global [4 x i32] undef, align 4 + +; Pointer should not exist. +; CHECK-NOT: @lds_used_within_func.pointer + +; CHECK: @ptr_to_func = internal local_unnamed_addr externally_initialized global void ()* @func_uses_lds, align 8 +@ptr_to_func = internal local_unnamed_addr externally_initialized global void ()* @func_uses_lds, align 8 + +; Module and kernel LDS +; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 2 +; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata" +; CHECK: @llvm.amdgcn.kernel.kernel_reaches_lds.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kernel_reaches_lds.lds.t undef, align 4 +; CHECK: @llvm.amdgcn.kernel.kernel_does_not_reach_lds.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kernel_does_not_reach_lds.lds.t undef, align 4 + +define internal void @func_uses_lds() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_func, i32 0, i32 0 + ret void +} + +define internal void 
@func_does_not_use_lds_3() { +; CHECK-LABEL: entry: +; CHECK: %fptr = load void ()*, void ()** @ptr_to_func, align 8 +; CHECK: call void %fptr() +; CHECK: ret void +entry: + %fptr = load void ()*, void ()** @ptr_to_func, align 8 + call void %fptr() + ret void +} + +define internal void @func_does_not_use_lds_2() { +; CHECK-LABEL: entry: +; CHECK: %fptr = load void ()*, void ()** @ptr_to_func, align 8 +; CHECK: call void %fptr() +; CHECK: ret void +entry: + %fptr = load void ()*, void ()** @ptr_to_func, align 8 + call void %fptr() + ret void +} + +define internal void @func_does_not_use_lds_1() { +; CHECK-LABEL: entry: +; CHECK: call void @func_does_not_use_lds_2() +; CHECK: call void @func_does_not_use_lds_3() +; CHECK: ret void +entry: + call void @func_does_not_use_lds_2() + call void @func_does_not_use_lds_3() + ret void +} + +define protected amdgpu_kernel void @kernel_reaches_lds() { +; CHECK-LABEL: entry: +; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: store i16 ptrtoint (%llvm.amdgcn.kernel.kernel_reaches_lds.lds.t addrspace(3)* @llvm.amdgcn.kernel.kernel_reaches_lds.lds to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2 +; CHECK: call void @func_does_not_use_lds_1() +; CHECK: ret void +entry: + call void @func_does_not_use_lds_1() + ret void +} + +define protected amdgpu_kernel void @kernel_does_not_reach_lds() { +; CHECK-LABEL: entry: +; CHECK-LABEL: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: ret void +entry: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-misc1.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-misc1.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-misc1.ll @@ -0,0 
+1,81 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s + +; Since indirect call site resolves all the three functions as potential callees (because signature matches), +; all the LDS require pointer replacement. +; CHECK: %llvm.amdgcn.module.lds.t = type { i16, i16, i16 } +; CHECK: %llvm.amdgcn.kernel.kernel_calls_function_1_and_2.lds.t = type { [4 x i32], [4 x i32], [4 x i32] } + +; CHECK-NOT: @lds_used_within_function_1 +; CHECK-NOT: @lds_used_within_function_2 +; CHECK-NOT: @lds_used_within_function_3 +@lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4 + +; CHECK-NOT: @lds_used_within_function_1.pointer +; CHECK-NOT: @lds_used_within_function_2.pointer +; CHECK-NOT: @lds_used_within_function_3.pointer + +; CHECK: @ptr_to_func1 = internal local_unnamed_addr externally_initialized global void (i16)* @function_1, align 8 +; CHECK: @ptr_to_func2 = internal local_unnamed_addr externally_initialized global void (i16)* @function_2, align 8 +; CHECK: @ptr_to_func3 = internal local_unnamed_addr externally_initialized global void (i16)* @function_3, align 8 +@ptr_to_func1 = internal local_unnamed_addr externally_initialized global void (i16)* @function_1, align 8 +@ptr_to_func2 = internal local_unnamed_addr externally_initialized global void (i16)* @function_2, align 8 +@ptr_to_func3 = internal local_unnamed_addr externally_initialized global void (i16)* @function_3, align 8 + +; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 2 +; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 
addrspace(3)*) to i8*)], section "llvm.metadata" +; CHECK: @llvm.amdgcn.kernel.kernel_calls_function_1_and_2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kernel_calls_function_1_and_2.lds.t undef, align 4 + +define internal void @function_3(i16 %i) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_3, i32 0, i32 0 + ret void +} + +define internal void @function_2(i16 %i) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_2, i32 0, i32 0 + ret void +} + +define internal void @function_1(i16 %i) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr 
inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_1, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel_calls_function_1_and_2() { +; CHECK-LABEL: entry: +; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.kernel_calls_function_1_and_2.lds.t, %llvm.amdgcn.kernel.kernel_calls_function_1_and_2.lds.t addrspace(3)* @llvm.amdgcn.kernel.kernel_calls_function_1_and_2.lds, i32 0, i32 2) to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, +; CHECK: i32 2), align 2 +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.kernel_calls_function_1_and_2.lds.t, %llvm.amdgcn.kernel.kernel_calls_function_1_and_2.lds.t addrspace(3)* @llvm.amdgcn.kernel.kernel_calls_function_1_and_2.lds, i32 0, i32 1) to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2 +; CHECK: store i16 ptrtoint (%llvm.amdgcn.kernel.kernel_calls_function_1_and_2.lds.t addrspace(3)* @llvm.amdgcn.kernel.kernel_calls_function_1_and_2.lds to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2 +; CHECK: %fptr1 = load void (i16)*, void (i16)** @ptr_to_func1, align 8 +; CHECK: call void %fptr1(i16 6) +; CHECK: ret void +entry: + %fptr1 = load void (i16)*, void (i16)** @ptr_to_func1, align 8 + call void %fptr1(i16 6) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-selected_functions.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-selected_functions.ll new file mode 100644 --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-selected_functions.ll @@ -0,0 +1,112 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s + +; All the three lds require pointer replacement since functions which access them are +; *indirectly* reachable. +; CHECK: %llvm.amdgcn.module.lds.t = type { i16, i16, i16 } + +; FIXME: Original LDS should have lowered by kernel lowering. +@lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4 +@lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4 + +; CHECK-NOT: @lds_used_within_function_1.pointer +; CHECK-NOT: @lds_used_within_function_2.pointer +; CHECK-NOT: @lds_used_within_function_3.pointer + +; CHECK: @ptr_to_func1 = internal local_unnamed_addr externally_initialized global void (float)* @function_1, align 8 +; CHECK: @ptr_to_func2 = internal local_unnamed_addr externally_initialized global void (i16)* @function_2, align 8 +; CHECK: @ptr_to_func3 = internal local_unnamed_addr externally_initialized global void (i8)* @function_3, align 8 +@ptr_to_func1 = internal local_unnamed_addr externally_initialized global void (float)* @function_1, align 8 +@ptr_to_func2 = internal local_unnamed_addr externally_initialized global void (i16)* @function_2, align 8 +@ptr_to_func3 = internal local_unnamed_addr externally_initialized global void (i8)* @function_3, align 8 + +define internal void @function_3(i8 %c) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = 
getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_3, i32 0, i32 0 + ret void +} + +define internal void @function_2(i16 %i) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_2, i32 0, i32 0 + ret void +} + +define internal void @function_1(float %f) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_1, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel_calls_function_3_and_1() { +; CHECK-LABEL: entry: +; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 2 +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* 
@lds_used_within_function_1 to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2 +; CHECK: %fptr3 = load void (i8)*, void (i8)** @ptr_to_func3, align 8 +; CHECK: %fptr1 = load void (float)*, void (float)** @ptr_to_func1, align 8 +; CHECK: call void %fptr3(i8 1) +; CHECK: call void %fptr1(float 2.000000e+00) +; CHECK: ret void +entry: + %fptr3 = load void (i8)*, void (i8)** @ptr_to_func3, align 8 + %fptr1 = load void (float)*, void (float)** @ptr_to_func1, align 8 + call void %fptr3(i8 1) + call void %fptr1(float 2.0) + ret void +} + +define protected amdgpu_kernel void @kernel_calls_function_2_and_3() { +; CHECK-LABEL: entry: +; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 2 +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2 +; CHECK: %fptr2 = load void (i16)*, void (i16)** @ptr_to_func2, align 8 +; CHECK: %fptr3 = load void (i8)*, void (i8)** @ptr_to_func3, align 8 +; CHECK: call void %fptr2(i16 3) +; CHECK: call void %fptr3(i8 4) +; CHECK: ret void +entry: + %fptr2 = load void (i16)*, void (i16)** @ptr_to_func2, align 8 + %fptr3 = load void (i8)*, void (i8)** @ptr_to_func3, align 8 + call void %fptr2(i16 3) + call void %fptr3(i8 4) + ret void +} + +define protected amdgpu_kernel void @kernel_calls_function_1_and_2() { +; CHECK-LABEL: entry: +; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* 
@llvm.amdgcn.module.lds) ] +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2 +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2 +; CHECK: %fptr1 = load void (float)*, void (float)** @ptr_to_func1, align 8 +; CHECK: %fptr2 = load void (i16)*, void (i16)** @ptr_to_func2, align 8 +; CHECK: call void %fptr1(float 5.000000e+00) +; CHECK: call void %fptr2(i16 6) +; CHECK: ret void +entry: + %fptr1 = load void (float)*, void (float)** @ptr_to_func1, align 8 + %fptr2 = load void (i16)*, void (i16)** @ptr_to_func2, align 8 + call void %fptr1(float 5.0) + call void %fptr2(i16 6) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-multiple-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-multiple-lds.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-multiple-lds.ll @@ -0,0 +1,58 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s + +; All three LDS globals require pointer replacement since the only function which accesses them is reachable from the kernel. +; CHECK: %llvm.amdgcn.module.lds.t = type { i16, i16, i16 } +; CHECK: %llvm.amdgcn.kernel.kernel.lds.t = type { [3 x i32], [2 x i32], [1 x i32] } + +; Original LDS should not exist. 
+; CHECK-NOT: @lds1 +; CHECK-NOT: @lds2 +; CHECK-NOT: @lds3 +@lds1 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds2 = internal addrspace(3) global [2 x i32] undef, align 4 +@lds3 = internal addrspace(3) global [3 x i32] undef, align 4 + +; Pointers should not exist as standalone globals; they are members of the module LDS struct. +; CHECK-NOT: @lds1.pointer +; CHECK-NOT: @lds2.pointer +; CHECK-NOT: @lds3.pointer + +; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 2 +; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata" +; CHECK: @llvm.amdgcn.kernel.kernel.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kernel.lds.t undef, align 4 + +define internal void @function() { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [3 x i32] addrspace(3)* +; CHECK: %3 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2 +; CHECK: %4 = getelementptr i8, i8 addrspace(3)* null, i16 %3 +; CHECK: %5 = bitcast i8 addrspace(3)* %4 to [2 x i32] addrspace(3)* +; CHECK: %6 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2 +; CHECK: %7 = getelementptr i8, i8 addrspace(3)* null, i16 %6 +; CHECK: %8 = bitcast i8 addrspace(3)* %7 to [1 x i32] addrspace(3)* +; CHECK: %gep1 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %8, i32 0, i32 0 +; CHECK: %gep2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* %5, i32 
0, i32 0 +; CHECK: %gep3 = getelementptr inbounds [3 x i32], [3 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: ret void +entry: + %gep1 = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds1, i32 0, i32 0 + %gep2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @lds2, i32 0, i32 0 + %gep3 = getelementptr inbounds [3 x i32], [3 x i32] addrspace(3)* @lds3, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel() { +; CHECK-LABEL: entry: +; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: store i16 ptrtoint (%llvm.amdgcn.kernel.kernel.lds.t addrspace(3)* @llvm.amdgcn.kernel.kernel.lds to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 2 +; CHECK: store i16 ptrtoint ([2 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.kernel.lds.t, %llvm.amdgcn.kernel.kernel.lds.t addrspace(3)* @llvm.amdgcn.kernel.kernel.lds, i32 0, i32 1) to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2 +; CHECK: store i16 ptrtoint ([1 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.kernel.lds.t, %llvm.amdgcn.kernel.kernel.lds.t addrspace(3)* @llvm.amdgcn.kernel.kernel.lds, i32 0, i32 2) to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2 +; CHECK: call void @function() +; CHECK: ret void +entry: + call void @function() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-func.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-func.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-func.ll @@ -0,0 +1,46 @@ +; RUN: opt -S 
-mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s + +; The module struct should have the pointer as a member instead of the original LDS. +; CHECK: %llvm.amdgcn.module.lds.t = type { i16 } +; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [4 x i32] } + +; Original LDS should not exist. +; CHECK-NOT: @used_only_within_func +@used_only_within_func = addrspace(3) global [4 x i32] undef, align 4 + +; The pointer should not exist as a standalone global. +; CHECK-NOT: @used_only_within_func.pointer + +; Module and kernel LDS struct globals. +; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 2 +; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata" +; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 4 + +define void @f0(i32 %x) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: %4 = addrspacecast i32 addrspace(3)* %3 to i32* +; CHECK: %5 = ptrtoint i32* %4 to i64 +; CHECK: %6 = add i64 %5, %5 +; CHECK: %7 = inttoptr i64 %6 to i32* +; CHECK: store i32 %x, i32* %7, align 4 +; CHECK: ret void +entry: + store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* 
@used_only_within_func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 + ret void +} + +define amdgpu_kernel void @k0() { +; CHECK-LABEL: entry: +; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: store i16 ptrtoint (%llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2 +; CHECK: call void @f0(i32 0) +; CHECK: ret void +entry: + call void @f0(i32 0) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-kern-and-func.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-kern-and-func.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-kern-and-func.ll @@ -0,0 +1,55 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s + +; The module struct should have a pointer as a member instead of the original LDS @used_within_both_func_and_kern. +; CHECK: %llvm.amdgcn.module.lds.t = type { i16 } +; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [4 x i32], [4 x i32] } + +; Original LDS should not exist. 
+; CHECK-NOT: @used_only_within_kern +; CHECK-NOT: @used_within_both_func_and_kern +@used_only_within_kern = addrspace(3) global [4 x i32] undef, align 4 +@used_within_both_func_and_kern = addrspace(3) global [4 x i32] undef, align 4 + +; Pointers should not exist as standalone globals. +; CHECK-NOT: @used_only_within_kern.pointer +; CHECK-NOT: @used_within_both_func_and_kern.pointer + +; Module and kernel LDS struct globals. +; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 2 +; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata" +; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 4 + +define void @f0(i32 %x) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: %4 = addrspacecast i32 addrspace(3)* %3 to i32* +; CHECK: %5 = ptrtoint i32* %4 to i64 +; CHECK: %6 = add i64 %5, %5 +; CHECK: %7 = inttoptr i64 %6 to i32* +; CHECK: store i32 %x, i32* %7, align 4 +; CHECK: ret void +entry: + store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_within_both_func_and_kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_within_both_func_and_kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 + ret void +} + +define amdgpu_kernel void @k0() { +; CHECK-LABEL: entry: +; CHECK: call void 
@llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1) to i16), i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), align 2 +; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1, i32 0) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0, i32 0) to i32*) to i64)) to i32*), align 4 +; CHECK: %mul = mul i32 %ld, 2 +; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0, i32 0) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1, i32 0) to i32*) to i64)) to i32*), align 4 +; CHECK: call void @f0(i32 0) +; CHECK: ret void +entry: + %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_within_both_func_and_kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 + %mul = mul i32 %ld, 2 + store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* 
addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_within_both_func_and_kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 + call void @f0(i32 0) + ret void +}