diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -498,7 +498,7 @@
   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
   const DataLayout &DL = F.getParent()->getDataLayout();
 
-  Info->allocateModuleLDSGlobal(F);
+  Info->allocateKnownAddressLDSGlobal(F);
 
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -582,7 +582,7 @@
   const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
   const DataLayout &DL = F.getParent()->getDataLayout();
 
-  Info->allocateModuleLDSGlobal(F);
+  Info->allocateKnownAddressLDSGlobal(F);
 
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -101,8 +101,18 @@
     return WaveLimiter;
   }
 
-  unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
-  void allocateModuleLDSGlobal(const Function &F);
+  unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV) {
+    return allocateLDSGlobal(DL, GV, DynLDSAlign);
+  }
+  unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV,
+                             Align Trailing);
+
+  void allocateKnownAddressLDSGlobal(const Function &F);
+
+  // A kernel function may have an associated LDS allocation, and a kernel-scope
+  // LDS allocation must have an associated kernel function
+  static const GlobalVariable *
+  getKernelLDSGlobalFromFunction(const Function &F);
 
   Align getDynLDSAlign() const { return DynLDSAlign; }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -48,7 +48,8 @@
 }
 
 unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
-                                                  const GlobalVariable &GV) {
+                                                  const GlobalVariable &GV,
+                                                  Align Trailing) {
   auto Entry = LocalMemoryObjects.insert(std::make_pair(&GV, 0));
   if (!Entry.second)
     return Entry.first->second;
@@ -65,9 +66,8 @@
 
     StaticLDSSize += DL.getTypeAllocSize(GV.getValueType());
 
-    // Update the LDS size considering the padding to align the dynamic shared
-    // memory.
-    LDSSize = alignTo(StaticLDSSize, DynLDSAlign);
+    // Align LDS size to trailing, e.g. for aligning dynamic shared memory
+    LDSSize = alignTo(StaticLDSSize, Trailing);
   } else {
     assert(GV.getAddressSpace() == AMDGPUAS::REGION_ADDRESS &&
            "expected region address space");
 
@@ -83,21 +83,58 @@
   return Offset;
 }
 
+const GlobalVariable *
+AMDGPUMachineFunction::getKernelLDSGlobalFromFunction(const Function &F) {
+  const Module *M = F.getParent();
+  std::string KernelLDSName = "llvm.amdgcn.kernel.";
+  KernelLDSName += F.getName();
+  KernelLDSName += ".lds";
+  return M->getNamedGlobal(KernelLDSName);
+}
+
 // This kernel calls no functions that require the module lds struct
 static bool canElideModuleLDS(const Function &F) {
   return F.hasFnAttribute("amdgpu-elide-module-lds");
 }
 
-void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Function &F) {
+void AMDGPUMachineFunction::allocateKnownAddressLDSGlobal(const Function &F) {
   const Module *M = F.getParent();
   if (isModuleEntryFunction()) {
+
+    // Pointer values start from zero, memory allocated per-kernel-launch
+    // Variables can be grouped into a module level struct and a struct per
+    // kernel function by AMDGPULowerModuleLDSPass. If that is done, they
+    // are allocated at statically computable addresses here.
+    //
+    // Address 0
+    // {
+    //   llvm.amdgcn.module.lds
+    // }
+    // alignment padding
+    // {
+    //   llvm.amdgcn.kernel.some-name.lds
+    // }
+    // dynamic lds alignment padding
+
     const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");
+    const GlobalVariable *KV = getKernelLDSGlobalFromFunction(F);
+
+    // Avoid allocating dynamic lds padding between these variables
+    Align ModuleLDSTrailing = KV ? Align() : getDynLDSAlign();
+
     if (GV && !canElideModuleLDS(F)) {
-      unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
+      unsigned Offset =
+          allocateLDSGlobal(M->getDataLayout(), *GV, ModuleLDSTrailing);
       (void)Offset;
       assert(Offset == 0 &&
              "Module LDS expected to be allocated before other LDS");
     }
+
+    if (KV) {
+      unsigned Offset =
+          allocateLDSGlobal(M->getDataLayout(), *KV, getDynLDSAlign());
+      (void)Offset;
+    }
   }
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2297,7 +2297,7 @@
     return DAG.getEntryNode();
   }
 
-  Info->allocateModuleLDSGlobal(Fn);
+  Info->allocateKnownAddressLDSGlobal(Fn);
 
   SmallVector<ISD::InputArg, 16> Splits;
   SmallVector<CCValAssign, 16> ArgLocs;
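
Not part of the patch: a minimal standalone C++ sketch of the layout the new allocation order is meant to produce, using hypothetical sizes and alignments for llvm.amdgcn.module.lds, the per-kernel llvm.amdgcn.kernel.<name>.lds struct, and dynamic LDS. The alignTo helper and the HaveKernelStruct flag are stand-ins; the arithmetic mirrors allocateLDSGlobal, where the module struct's trailing alignment drops to 1 whenever a kernel struct follows it (ModuleLDSTrailing above).

// Illustrative only; sizes, alignments and HaveKernelStruct are hypothetical.
#include <cstdint>
#include <cstdio>

// Same rounding the patch performs via llvm::alignTo.
static uint64_t alignTo(uint64_t Size, uint64_t Align) {
  return (Size + Align - 1) / Align * Align;
}

int main() {
  const uint64_t ModuleSize = 12, ModuleAlign = 4;   // hypothetical
  const uint64_t KernelSize = 24, KernelAlign = 16;  // hypothetical
  const uint64_t DynLDSAlign = 8;                    // hypothetical
  const bool HaveKernelStruct = true;

  uint64_t LDSSize = 0;

  // The module struct is allocated first, so it lands at address 0. Its
  // trailing padding is only the dynamic-LDS alignment when no kernel
  // struct follows it.
  uint64_t ModuleOffset = alignTo(LDSSize, ModuleAlign); // == 0
  LDSSize = ModuleOffset + ModuleSize;
  if (!HaveKernelStruct)
    LDSSize = alignTo(LDSSize, DynLDSAlign);

  // The kernel struct, if present, is allocated next, then padded so any
  // dynamic LDS that follows is suitably aligned.
  uint64_t KernelOffset = 0;
  if (HaveKernelStruct) {
    KernelOffset = alignTo(LDSSize, KernelAlign);
    LDSSize = alignTo(KernelOffset + KernelSize, DynLDSAlign);
  }

  printf("module.lds at %llu, kernel.lds at %llu, dynamic LDS from %llu\n",
         (unsigned long long)ModuleOffset, (unsigned long long)KernelOffset,
         (unsigned long long)LDSSize);
  return 0;
}

With the values above this prints offsets 0 and 16, with dynamic LDS starting at 40, matching the "module struct, padding, kernel struct, dynamic padding" picture in the comment block added to allocateKnownAddressLDSGlobal.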