diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -498,7 +498,7 @@
   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
   const DataLayout &DL = F.getParent()->getDataLayout();
 
-  Info->allocateModuleLDSGlobal(F);
+  Info->allocateKnownAddressLDSGlobal(F);
 
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -582,7 +582,7 @@
   const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
   const DataLayout &DL = F.getParent()->getDataLayout();
 
-  Info->allocateModuleLDSGlobal(F);
+  Info->allocateKnownAddressLDSGlobal(F);
 
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -101,8 +101,18 @@
     return WaveLimiter;
   }
 
-  unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
-  void allocateModuleLDSGlobal(const Function &F);
+  unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV) {
+    return allocateLDSGlobal(DL, GV, DynLDSAlign);
+  }
+  unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV,
+                             Align Trailing);
+
+  void allocateKnownAddressLDSGlobal(const Function &F);
+
+  // A kernel function may have an associated LDS allocation, and a kernel-scope
+  // LDS allocation must have an associated kernel function
+  static const GlobalVariable *
+  getKernelLDSGlobalFromFunction(const Function &F);
 
   Align getDynLDSAlign() const { return DynLDSAlign; }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -48,7 +48,8 @@
 }
 
 unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
-                                                  const GlobalVariable &GV) {
+                                                  const GlobalVariable &GV,
+                                                  Align Trailing) {
   auto Entry = LocalMemoryObjects.insert(std::make_pair(&GV, 0));
   if (!Entry.second)
     return Entry.first->second;
@@ -65,9 +66,8 @@
 
     StaticLDSSize += DL.getTypeAllocSize(GV.getValueType());
 
-    // Update the LDS size considering the padding to align the dynamic shared
-    // memory.
-    LDSSize = alignTo(StaticLDSSize, DynLDSAlign);
+    // Align LDS size to trailing, e.g. for aligning dynamic shared memory
+    LDSSize = alignTo(StaticLDSSize, Trailing);
   } else {
     assert(GV.getAddressSpace() == AMDGPUAS::REGION_ADDRESS &&
            "expected region address space");
 
@@ -83,21 +83,58 @@
   return Offset;
 }
 
+const GlobalVariable *
+AMDGPUMachineFunction::getKernelLDSGlobalFromFunction(const Function &F) {
+  const Module *M = F.getParent();
+  std::string KernelLDSName = "llvm.amdgcn.kernel.";
+  KernelLDSName += F.getName();
+  KernelLDSName += ".lds";
+  return M->getNamedGlobal(KernelLDSName);
+}
+
 // This kernel calls no functions that require the module lds struct
 static bool canElideModuleLDS(const Function &F) {
   return F.hasFnAttribute("amdgpu-elide-module-lds");
 }
 
-void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Function &F) {
+void AMDGPUMachineFunction::allocateKnownAddressLDSGlobal(const Function &F) {
   const Module *M = F.getParent();
   if (isModuleEntryFunction()) {
+
+    // Pointer values start from zero, memory allocated per-kernel-launch
+    // Variables can be grouped into a module level struct and a struct per
+    // kernel function by AMDGPULowerModuleLDSPass. If that is done, they
+    // are allocated at statically computable addresses here.
+    //
+    // Address 0
+    // {
+    //   llvm.amdgcn.module.lds
+    // }
+    // alignment padding
+    // {
+    //   llvm.amdgcn.kernel.some-name.lds
+    // }
+    // dynamic lds alignment padding
+
     const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");
+    const GlobalVariable *KV = getKernelLDSGlobalFromFunction(F);
+
+    // Avoid allocating dynamic lds padding between these variables
+    Align ModuleLDSTrailing = KV ? Align() : getDynLDSAlign();
+
     if (GV && !canElideModuleLDS(F)) {
-      unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
+      unsigned Offset =
+          allocateLDSGlobal(M->getDataLayout(), *GV, ModuleLDSTrailing);
       (void)Offset;
       assert(Offset == 0 &&
              "Module LDS expected to be allocated before other LDS");
     }
+
+    if (KV) {
+      unsigned Offset =
+          allocateLDSGlobal(M->getDataLayout(), *KV, getDynLDSAlign());
+      (void)Offset;
+    }
   }
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2297,7 +2297,7 @@
     return DAG.getEntryNode();
   }
 
-  Info->allocateModuleLDSGlobal(Fn);
+  Info->allocateKnownAddressLDSGlobal(Fn);
 
   SmallVector<ISD::InputArg, 16> Splits;
   SmallVector<CCValAssign, 16> ArgLocs;
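
Not part of the patch: a minimal standalone C++ sketch of the layout the new allocation order is meant to produce, using hypothetical sizes and alignments for llvm.amdgcn.module.lds, the per-kernel llvm.amdgcn.kernel.<name>.lds struct, and dynamic LDS. The alignTo helper and the HaveKernelStruct flag are stand-ins; the arithmetic mirrors allocateLDSGlobal, where the module struct's trailing alignment drops to 1 whenever a kernel struct follows it (ModuleLDSTrailing above).

// Illustrative only; sizes, alignments and HaveKernelStruct are hypothetical.
#include <cstdint>
#include <cstdio>

// Same rounding the patch performs via llvm::alignTo.
static uint64_t alignTo(uint64_t Size, uint64_t Align) {
  return (Size + Align - 1) / Align * Align;
}

int main() {
  const uint64_t ModuleSize = 12, ModuleAlign = 4;   // hypothetical
  const uint64_t KernelSize = 24, KernelAlign = 16;  // hypothetical
  const uint64_t DynLDSAlign = 8;                    // hypothetical
  const bool HaveKernelStruct = true;

  uint64_t LDSSize = 0;

  // The module struct is allocated first, so it lands at address 0. Its
  // trailing padding is only the dynamic-LDS alignment when no kernel
  // struct follows it.
  uint64_t ModuleOffset = alignTo(LDSSize, ModuleAlign); // == 0
  LDSSize = ModuleOffset + ModuleSize;
  if (!HaveKernelStruct)
    LDSSize = alignTo(LDSSize, DynLDSAlign);

  // The kernel struct, if present, is allocated next, then padded so any
  // dynamic LDS that follows is suitably aligned.
  uint64_t KernelOffset = 0;
  if (HaveKernelStruct) {
    KernelOffset = alignTo(LDSSize, KernelAlign);
    LDSSize = alignTo(KernelOffset + KernelSize, DynLDSAlign);
  }

  printf("module.lds at %llu, kernel.lds at %llu, dynamic LDS from %llu\n",
         (unsigned long long)ModuleOffset, (unsigned long long)KernelOffset,
         (unsigned long long)LDSSize);
  return 0;
}

With the values above this prints offsets 0 and 16, with dynamic LDS starting at 40, matching the "module struct, padding, kernel struct, dynamic padding" picture in the comment block added to allocateKnownAddressLDSGlobal.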