diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -512,7 +512,7 @@ const SITargetLowering &TLI = *getTLI(); const DataLayout &DL = F.getParent()->getDataLayout(); - Info->allocateKnownAddressLDSGlobal(F); + Info->allocateKnownAddressLDSGlobal(MF.getFunction().getParent(), F); SmallVector ArgLocs; CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); @@ -596,7 +596,7 @@ const SIRegisterInfo *TRI = Subtarget.getRegisterInfo(); const DataLayout &DL = F.getParent()->getDataLayout(); - Info->allocateKnownAddressLDSGlobal(F); + Info->allocateKnownAddressLDSGlobal(MF.getFunction().getParent(), F); SmallVector ArgLocs; CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2543,7 +2543,8 @@ // allocated ones. They all share the same offset. if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { // Adjust alignment for that dynamic shared memory array. - MFI->setDynLDSAlign(B.getDataLayout(), *cast(GV)); + Function &F = MF.getFunction(); + MFI->setDynLDSAlign(F.getParent(), F, *cast(GV)); LLT S32 = LLT::scalar(32); auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -183,6 +183,18 @@ return AMDGPU::isKernel(F->getCallingConv()); } +bool isDynamicLDS(const GlobalVariable *GV) { + if (AMDGPU::isLDSVariableToLower(*GV)) { + if (!GV->hasInitializer()) { + // addrspace(3) without initializer implies cuda/hip extern __shared__ + // the semantics for such a variable appears to be that all extern + // __shared__ variables alias one another + return true; + } + } + return false; +} + class AMDGPULowerModuleLDS : public ModulePass { static void @@ -217,6 +229,8 @@ // llvm.donothing that takes a pointer to the instance and is lowered to a // no-op after LDS is allocated, but that is not presently necessary. + // The intrinsic is eliminated shortly before instruction selection. + LLVMContext &Ctx = Func->getContext(); Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI()); @@ -519,28 +533,6 @@ IRBuilder<> Builder(Ctx); Type *I32 = Type::getInt32Ty(Ctx); - // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which - // lowers to a read from a live in register. Emit it once in the entry - // block to spare deduplicating it later. - - DenseMap tableKernelIndexCache; - auto getTableKernelIndex = [&](Function *F) -> Value * { - if (tableKernelIndexCache.count(F) == 0) { - LLVMContext &Ctx = M.getContext(); - FunctionType *FTy = FunctionType::get(Type::getInt32Ty(Ctx), {}); - Function *Decl = - Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {}); - - BasicBlock::iterator it = - F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca(); - Instruction &i = *it; - Builder.SetInsertPoint(&i); - - tableKernelIndexCache[F] = Builder.CreateCall(FTy, Decl, {}); - } - - return tableKernelIndexCache[F]; - }; for (size_t Index = 0; Index < ModuleScopeVariables.size(); Index++) { auto *GV = ModuleScopeVariables[Index]; @@ -550,7 +542,8 @@ if (!I) continue; - Value *tableKernelIndex = getTableKernelIndex(I->getFunction()); + Value *tableKernelIndex = + getTableLookupKernelIndex(M, I->getFunction()); // So if the phi uses this value multiple times, what does this look // like? @@ -658,6 +651,29 @@ return MostUsed.GV; } + DenseMap tableKernelIndexCache; + Value *getTableLookupKernelIndex(Module &M, Function *F) { + // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which + // lowers to a read from a live in register. Emit it once in the entry + // block to spare deduplicating it later. + if (tableKernelIndexCache.count(F) == 0) { + LLVMContext &Ctx = M.getContext(); + IRBuilder<> Builder(Ctx); + FunctionType *FTy = FunctionType::get(Type::getInt32Ty(Ctx), {}); + Function *Decl = + Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {}); + + BasicBlock::iterator it = + F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca(); + Instruction &i = *it; + Builder.SetInsertPoint(&i); + + tableKernelIndexCache[F] = Builder.CreateCall(FTy, Decl, {}); + } + + return tableKernelIndexCache[F]; + } + bool runOnModule(Module &M) override { LLVMContext &Ctx = M.getContext(); CallGraph CG = CallGraph(M); @@ -685,6 +701,7 @@ DenseSet ModuleScopeVariables; DenseSet TableLookupVariables; DenseSet KernelAccessVariables; + DenseSet DynamicVariables; { GlobalVariable *HybridModuleRoot = @@ -708,6 +725,11 @@ assert(AMDGPU::isLDSVariableToLower(*GV)); assert(K.second.size() != 0); + if (isDynamicLDS(GV)) { + DynamicVariables.insert(GV); + continue; + } + switch (LoweringKindLoc) { case LoweringKind::module: ModuleScopeVariables.insert(GV); @@ -744,9 +766,10 @@ } assert(ModuleScopeVariables.size() + TableLookupVariables.size() + - KernelAccessVariables.size() == + KernelAccessVariables.size() + DynamicVariables.size() == LDSToKernelsThatNeedToAccessItIndirectly.size()); - } // Variables have now been partitioned into the three lowering strategies. + } // Variables have now been partitioned into the distinct lowering + // strategies. // If the kernel accesses a variable that is going to be stored in the // module instance through a call then that kernel needs to allocate the @@ -757,6 +780,9 @@ DenseSet KernelsThatAllocateTableLDS = kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, TableLookupVariables); + DenseSet KernelsThatAllocateDynamicLDS = + kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, + DynamicVariables); if (!ModuleScopeVariables.empty()) { LDSVariableReplacement ModuleScopeReplacement = @@ -813,7 +839,8 @@ } } - // Create a struct for each kernel for the non-module-scope variables + // Create a struct for each kernel for the non-module-scope, non-dynamic + // variables DenseMap KernelToReplacement; for (Function &Func : M.functions()) { if (Func.isDeclaration() || !isKernelLDS(&Func)) @@ -821,10 +848,12 @@ DenseSet KernelUsedVariables; for (auto &v : LDSUsesInfo.direct_access[&Func]) { - KernelUsedVariables.insert(v); + if (!isDynamicLDS(v)) + KernelUsedVariables.insert(v); } for (auto &v : LDSUsesInfo.indirect_access[&Func]) { - KernelUsedVariables.insert(v); + if (!isDynamicLDS(v)) + KernelUsedVariables.insert(v); } // Variables allocated in module lds must all resolve to that struct, @@ -883,44 +912,56 @@ }); } - if (!KernelsThatAllocateTableLDS.empty()) { - // Collect the kernels that allocate table lookup LDS - std::vector OrderedKernels; - { - for (Function &Func : M.functions()) { - if (Func.isDeclaration()) - continue; - if (!isKernelLDS(&Func)) - continue; + // Collect kernels that allocate table lookup LDS or need to allocate + // dynamic LDS + std::vector OrderedKernels; + if (!KernelsThatAllocateTableLDS.empty() || + !KernelsThatAllocateDynamicLDS.empty()) { - if (KernelsThatAllocateTableLDS.contains(&Func)) { - assert(Func.hasName()); // else fatal error earlier - OrderedKernels.push_back(&Func); - } + for (Function &Func : M.functions()) { + if (Func.isDeclaration()) + continue; + if (!isKernelLDS(&Func)) + continue; + + if (KernelsThatAllocateTableLDS.contains(&Func) || + KernelsThatAllocateDynamicLDS.contains(&Func)) { + assert(Func.hasName()); // else fatal error earlier (todo: make this + // true) + OrderedKernels.push_back(&Func); } + } - // Put them in an arbitrary but reproducible order - llvm::sort(OrderedKernels.begin(), OrderedKernels.end(), - [](const Function *lhs, const Function *rhs) -> bool { - return lhs->getName() < rhs->getName(); - }); + // Put them in an arbitrary but reproducible order + llvm::sort(OrderedKernels.begin(), OrderedKernels.end(), + [](const Function *lhs, const Function *rhs) -> bool { + return lhs->getName() < rhs->getName(); + }); - // Annotate the kernels with their order in this vector + // Annotate the kernels with their order in this vector + LLVMContext &Ctx = M.getContext(); + IRBuilder<> Builder(Ctx); + + if (OrderedKernels.size() > UINT32_MAX) { + // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU + report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels"); + } + + for (size_t i = 0; i < OrderedKernels.size(); i++) { + Metadata *AttrMDArgs[1] = { + ConstantAsMetadata::get(Builder.getInt32(i)), + }; + OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id", + MDNode::get(Ctx, AttrMDArgs)); + } + } + + if (!KernelsThatAllocateTableLDS.empty()) { + { LLVMContext &Ctx = M.getContext(); IRBuilder<> Builder(Ctx); - if (OrderedKernels.size() > UINT32_MAX) { - // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU - report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels"); - } - for (size_t i = 0; i < OrderedKernels.size(); i++) { - Metadata *AttrMDArgs[1] = { - ConstantAsMetadata::get(Builder.getInt32(i)), - }; - OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id", - MDNode::get(Ctx, AttrMDArgs)); - markUsedByKernel(Builder, OrderedKernels[i], KernelToReplacement[OrderedKernels[i]].SGV); } @@ -936,15 +977,106 @@ return lhs->getName() < rhs->getName(); }); + // element[k] of lookup table is an array of i32 of length + // TableLookupVariablesOrdered.size() containing the address of the ith + // variable at index i + GlobalVariable *LookupTable = buildLookupTable( M, TableLookupVariablesOrdered, OrderedKernels, KernelToReplacement); replaceUsesInInstructionsWithTableLookup(M, TableLookupVariablesOrdered, LookupTable); } + if (!KernelsThatAllocateDynamicLDS.empty()) { + LLVMContext &Ctx = M.getContext(); + IRBuilder<> Builder(Ctx); + const DataLayout &DL = M.getDataLayout(); + Type *I32 = Type::getInt32Ty(Ctx); + + std::vector newDynamicLDS; + + for (auto &func : OrderedKernels) { + + if (KernelsThatAllocateDynamicLDS.contains(func)) { + + Align MaxDynamicAlignment(1); + fprintf(stdout, "kernel %s\n", func->getName().str().c_str()); + for (GlobalVariable *GV : LDSUsesInfo.indirect_access[func]) { + if (!DynamicVariables.contains(GV)) + continue; + + fprintf(stdout, " uses dynamic variable\n "); + GV->dump(); + + MaxDynamicAlignment = + std::max(MaxDynamicAlignment, AMDGPU::getAlign(DL, GV)); + } + + auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0); + std::string VarName = + Twine("llvm.amdgcn." + func->getName() + ".dynlds").str(); + GlobalVariable *N = new GlobalVariable( + M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr, + VarName, nullptr, GlobalValue::NotThreadLocal, + AMDGPUAS::LOCAL_ADDRESS, false); + N->setAlignment(MaxDynamicAlignment); + + markUsedByKernel(Builder, func, N); + + assert(isDynamicLDS(N)); + + auto GEP = ConstantExpr::getGetElementPtr( + emptyCharArray, N, ConstantInt::get(I32, 0), true); + newDynamicLDS.push_back(ConstantExpr::getPtrToInt(GEP, I32)); + } else { + newDynamicLDS.push_back(PoisonValue::get(I32)); + } + } + + ArrayType *t = ArrayType::get(I32, OrderedKernels.size()); + Constant *init = ConstantArray::get(t, newDynamicLDS); + GlobalVariable *table = new GlobalVariable( + M, t, true, GlobalValue::InternalLinkage, init, + "llvm.amdgcn.dynlds.offset.table", nullptr, + GlobalValue::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS); + + for (GlobalVariable *GV : DynamicVariables) { + for (Use &U : make_early_inc_range(GV->uses())) { + auto *I = dyn_cast(U.getUser()); + if (!I) + continue; + if (isKernelLDS(I->getFunction())) + continue; + + Value *tableKernelIndex = + getTableLookupKernelIndex(M, I->getFunction()); + if (auto *Phi = dyn_cast(I)) { + BasicBlock *BB = Phi->getIncomingBlock(U); + Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt()))); + } else { + Builder.SetInsertPoint(I); + } + + Value *GEPIdx[2] = { + ConstantInt::get(I32, 0), + tableKernelIndex, + }; + + Value *Address = Builder.CreateInBoundsGEP( + table->getValueType(), table, GEPIdx, GV->getName()); + + Value *loaded = Builder.CreateLoad(I32, Address); + + Value *replacement = + Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName()); + + U.set(replacement); + } + } + } + for (auto &GV : make_early_inc_range(M.globals())) if (AMDGPU::isLDSVariableToLower(GV)) { - // probably want to remove from used lists GV.removeDeadConstantUsers(); if (GV.use_empty()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -13,6 +13,7 @@ // #include "AMDGPUMCInstLower.h" +#include "AMDGPU.h" #include "AMDGPUAsmPrinter.h" #include "AMDGPUMachineFunction.h" #include "AMDGPUTargetMachine.h" @@ -168,12 +169,17 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) { // Intercept LDS variables with known addresses - if (const GlobalVariable *GV = dyn_cast(CV)) { - if (AMDGPUMachineFunction::isKnownAddressLDSGlobal(*GV)) { - unsigned offset = - AMDGPUMachineFunction::calculateKnownAddressOfLDSGlobal(*GV); - Constant *C = ConstantInt::get(CV->getContext(), APInt(32, offset)); - return AsmPrinter::lowerConstant(C); + if (const GlobalVariable *GV = dyn_cast(CV)) { + if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + auto MD = GV->getMetadata(LLVMContext::MD_absolute_symbol); + if (MD && MD->getNumOperands() == 1) { + if (ConstantInt *KnownSize = + mdconst::extract(MD->getOperand(0))) { + if (KnownSize->getZExtValue() <= UINT32_MAX) { + return AsmPrinter::lowerConstant(KnownSize); + } + } + } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -104,7 +104,7 @@ unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV, Align Trailing); - void allocateKnownAddressLDSGlobal(const Function &F); + void allocateKnownAddressLDSGlobal(Module *M, const Function &F); // A kernel function may have an associated LDS allocation, and a kernel-scope // LDS allocation must have an associated kernel function @@ -112,18 +112,16 @@ // LDS allocation should have an associated kernel function static const Function * getKernelLDSFunctionFromGlobal(const GlobalVariable &GV); - static const GlobalVariable * - getKernelLDSGlobalFromFunction(const Function &F); - - // Module or kernel scope LDS variable - static bool isKnownAddressLDSGlobal(const GlobalVariable &GV); - static unsigned calculateKnownAddressOfLDSGlobal(const GlobalVariable &GV); + static GlobalVariable *getKernelLDSGlobalFromFunction(Module *M, + const Function &F); + static GlobalVariable *getKernelDynLDSGlobalFromFunction(Module *M, + const Function &F); static std::optional getLDSKernelIdMetadata(const Function &F); Align getDynLDSAlign() const { return DynLDSAlign; } - void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV); + void setDynLDSAlign(Module *M, const Function &F, const GlobalVariable &GV); }; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -12,6 +12,8 @@ #include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Metadata.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -89,12 +91,6 @@ static constexpr StringLiteral ModuleLDSName = "llvm.amdgcn.module.lds"; -bool AMDGPUMachineFunction::isKnownAddressLDSGlobal(const GlobalVariable &GV) { - auto name = GV.getName(); - return (name == ModuleLDSName) || - (name.startswith("llvm.amdgcn.kernel.") && name.endswith(".lds")); -} - const Function *AMDGPUMachineFunction::getKernelLDSFunctionFromGlobal( const GlobalVariable &GV) { const Module &M = *GV.getParent(); @@ -105,53 +101,42 @@ return nullptr; } -const GlobalVariable * -AMDGPUMachineFunction::getKernelLDSGlobalFromFunction(const Function &F) { - const Module *M = F.getParent(); +GlobalVariable * +AMDGPUMachineFunction::getKernelLDSGlobalFromFunction(Module *M, + const Function &F) { std::string KernelLDSName = "llvm.amdgcn.kernel."; KernelLDSName += F.getName(); KernelLDSName += ".lds"; return M->getNamedGlobal(KernelLDSName); } +GlobalVariable * +AMDGPUMachineFunction::getKernelDynLDSGlobalFromFunction(Module *M, + const Function &F) { + std::string KernelDynLDSName = "llvm.amdgcn."; + KernelDynLDSName += F.getName(); + KernelDynLDSName += ".dynlds"; + return M->getNamedGlobal(KernelDynLDSName); +} + // This kernel calls no functions that require the module lds struct static bool canElideModuleLDS(const Function &F) { return F.hasFnAttribute("amdgpu-elide-module-lds"); } -unsigned AMDGPUMachineFunction::calculateKnownAddressOfLDSGlobal( - const GlobalVariable &GV) { - // module.lds, then alignment padding, then kernel.lds, then other variables - // if any - - assert(isKnownAddressLDSGlobal(GV)); - unsigned Offset = 0; - - if (GV.getName() == ModuleLDSName) { - return 0; - } - - const Module *M = GV.getParent(); - const DataLayout &DL = M->getDataLayout(); - - const GlobalVariable *GVM = M->getNamedGlobal(ModuleLDSName); - const Function *f = getKernelLDSFunctionFromGlobal(GV); - - // Account for module.lds if allocated for this function - if (GVM && f && !canElideModuleLDS(*f)) { - // allocator aligns this to var align, but it's zero to begin with - Offset += DL.getTypeAllocSize(GVM->getValueType()); - } - - // No dynamic LDS alignment done by allocateModuleLDSGlobal - Offset = alignTo( - Offset, DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType())); - - return Offset; +static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, + uint32_t Address) { + // Write the specified address into metadata where it can be retrieved by the + // assembler + LLVMContext &Ctx = M->getContext(); + auto Type = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS); + GV->setMetadata(LLVMContext::MD_absolute_symbol, + MDNode::get(Ctx, ConstantAsMetadata::get( + ConstantInt::get(Type, Address)))); } -void AMDGPUMachineFunction::allocateKnownAddressLDSGlobal(const Function &F) { - const Module *M = F.getParent(); +void AMDGPUMachineFunction::allocateKnownAddressLDSGlobal(Module *M, + const Function &F) { // This function is called before allocating any other LDS so that it can // reliably put values at known addresses. Consequently, dynamic LDS, if @@ -176,25 +161,24 @@ // } // other variables, e.g. dynamic lds, allocated after this call - const GlobalVariable *GV = M->getNamedGlobal(ModuleLDSName); - const GlobalVariable *KV = getKernelLDSGlobalFromFunction(F); + GlobalVariable *GV = M->getNamedGlobal(ModuleLDSName); + GlobalVariable *KV = getKernelLDSGlobalFromFunction(M, F); + GlobalVariable *Dyn = getKernelDynLDSGlobalFromFunction(M, F); if (GV && !canElideModuleLDS(F)) { - assert(isKnownAddressLDSGlobal(*GV)); unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV, Align()); - (void)Offset; - assert(Offset == calculateKnownAddressOfLDSGlobal(*GV) && - "Module LDS expected to be allocated before other LDS"); + recordLDSAbsoluteAddress(M, GV, Offset); } if (KV) { // The per-kernel offset is deterministic because it is allocated // before any other non-module LDS variables. - assert(isKnownAddressLDSGlobal(*KV)); unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *KV, Align()); - (void)Offset; - assert(Offset == calculateKnownAddressOfLDSGlobal(*KV) && - "Kernel LDS expected to be immediately after module LDS"); + recordLDSAbsoluteAddress(M, KV, Offset); + } + + if (Dyn) { + setDynLDSAlign(M, F, *Dyn); } } } @@ -214,8 +198,9 @@ return {}; } -void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL, +void AMDGPUMachineFunction::setDynLDSAlign(Module *M, const Function &F, const GlobalVariable &GV) { + const DataLayout &DL = M->getDataLayout(); assert(DL.getTypeAllocSize(GV.getValueType()).isZero()); Align Alignment = @@ -225,4 +210,13 @@ LDSSize = alignTo(StaticLDSSize, Alignment); DynLDSAlign = Alignment; + + // If there is a dynamic LDS variable associated with this function, update it + // whenever changing the address of dynamic LDS, aka when calling this + // function + GlobalVariable *Dyn = + AMDGPUMachineFunction::getKernelDynLDSGlobalFromFunction(M, F); + if (Dyn) { + recordLDSAbsoluteAddress(M, Dyn, LDSSize); + } } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2412,7 +2412,7 @@ return DAG.getEntryNode(); } - Info->allocateKnownAddressLDSGlobal(Fn); + Info->allocateKnownAddressLDSGlobal(MF.getFunction().getParent(), Fn); SmallVector Splits; SmallVector ArgLocs; @@ -6139,7 +6139,8 @@ if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) { assert(PtrVT == MVT::i32 && "32-bit pointer is expected."); // Adjust alignment for that dynamic shared memory array. - MFI->setDynLDSAlign(DAG.getDataLayout(), *cast(GV)); + Function &F = DAG.getMachineFunction().getFunction(); + MFI->setDynLDSAlign(F.getParent(), F, *cast(GV)); return SDValue( DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp @@ -79,24 +79,23 @@ if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) { return false; } + if (GV.isConstant()) { + // A constant undef variable can't be written to, and any load is + // undef, so it should be eliminated by the optimizer. It could be + // dropped by the back end if not. This pass skips over it. + return false; + } if (!GV.hasInitializer()) { // addrspace(3) without initializer implies cuda/hip extern __shared__ // the semantics for such a variable appears to be that all extern - // __shared__ variables alias one another, in which case this transform - // is not required - return false; + // __shared__ variables alias one another. This hits different handling. + return true; } if (!isa(GV.getInitializer())) { // Initializers are unimplemented for LDS address space. // Leave such variables in place for consistent error reporting. return false; } - if (GV.isConstant()) { - // A constant undef variable can't be written to, and any load is - // undef, so it should be eliminated by the optimizer. It could be - // dropped by the back end if not. This pass skips over it. - return false; - } return true; } diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll @@ -18,10 +18,6 @@ @const_undef = addrspace(3) constant i32 undef @const_with_init = addrspace(3) constant i64 8 -; External and constant are both left to the optimizer / error diagnostics -; CHECK: @extern = external addrspace(3) global i32 -@extern = external addrspace(3) global i32 - ; Use of an addrspace(3) variable with an initializer is skipped, ; so as to preserve the unimplemented error from llc ; CHECK: @with_init = addrspace(3) global i64 0 @@ -47,14 +43,12 @@ ; CHECK: %c0 = load i32, ptr addrspace(3) @const_undef, align 4 ; CHECK: %c1 = load i64, ptr addrspace(3) @const_with_init, align 4 ; CHECK: %v0 = atomicrmw add ptr addrspace(3) @with_init, i64 1 seq_cst -; CHECK: %v1 = cmpxchg ptr addrspace(3) @extern, i32 4, i32 %c0 acq_rel monotonic -; CHECK: %v2 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic +; CHECK: %v1 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic define void @use_variables() { %c0 = load i32, ptr addrspace(3) @const_undef, align 4 %c1 = load i64, ptr addrspace(3) @const_with_init, align 4 %v0 = atomicrmw add ptr addrspace(3) @with_init, i64 1 seq_cst - %v1 = cmpxchg ptr addrspace(3) @extern, i32 4, i32 %c0 acq_rel monotonic - %v2 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic + %v1 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic ret void }