diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -512,7 +512,7 @@
   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
   const DataLayout &DL = F.getParent()->getDataLayout();
 
-  Info->allocateKnownAddressLDSGlobal(F);
+  Info->allocateKnownAddressLDSGlobal(MF.getFunction().getParent(), F);
 
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -596,7 +596,7 @@
   const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
   const DataLayout &DL = F.getParent()->getDataLayout();
 
-  Info->allocateKnownAddressLDSGlobal(F);
+  Info->allocateKnownAddressLDSGlobal(MF.getFunction().getParent(), F);
 
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2543,7 +2543,8 @@
       // allocated ones. They all share the same offset.
       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
         // Adjust alignment for that dynamic shared memory array.
-        MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
+        Function &F = MF.getFunction();
+        MFI->setDynLDSAlign(F.getParent(), F, *cast<GlobalVariable>(GV));
         LLT S32 = LLT::scalar(32);
         auto Sz =
             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -183,6 +183,18 @@
   return AMDGPU::isKernel(F->getCallingConv());
 }
 
+bool isDynamicLDS(const GlobalVariable *GV) {
+  if (AMDGPU::isLDSVariableToLower(*GV)) {
+    if (!GV->hasInitializer()) {
+      // addrspace(3) without initializer implies cuda/hip extern __shared__
+      // the semantics for such a variable appears to be that all extern
+      // __shared__ variables alias one another
+      return true;
+    }
+  }
+  return false;
+}
+
 class AMDGPULowerModuleLDS : public ModulePass {
 
   static void
@@ -217,6 +229,8 @@
     // llvm.donothing that takes a pointer to the instance and is lowered to a
     // no-op after LDS is allocated, but that is not presently necessary.
 
+    // The intrinsic is eliminated shortly before instruction selection.
+
     LLVMContext &Ctx = Func->getContext();
 
     Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI());
@@ -519,28 +533,6 @@
     IRBuilder<> Builder(Ctx);
     Type *I32 = Type::getInt32Ty(Ctx);
 
-    // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
-    // lowers to a read from a live in register. Emit it once in the entry
-    // block to spare deduplicating it later.
-
-    DenseMap<Function *, Value *> tableKernelIndexCache;
-    auto getTableKernelIndex = [&](Function *F) -> Value * {
-      if (tableKernelIndexCache.count(F) == 0) {
-        LLVMContext &Ctx = M.getContext();
-        FunctionType *FTy = FunctionType::get(Type::getInt32Ty(Ctx), {});
-        Function *Decl =
-            Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {});
-
-        BasicBlock::iterator it =
-            F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
-        Instruction &i = *it;
-        Builder.SetInsertPoint(&i);
-
-        tableKernelIndexCache[F] = Builder.CreateCall(FTy, Decl, {});
-      }
-
-      return tableKernelIndexCache[F];
-    };
 
     for (size_t Index = 0; Index < ModuleScopeVariables.size(); Index++) {
       auto *GV = ModuleScopeVariables[Index];
@@ -550,7 +542,8 @@
         if (!I)
           continue;
 
-        Value *tableKernelIndex = getTableKernelIndex(I->getFunction());
+        Value *tableKernelIndex =
+            getTableLookupKernelIndex(M, I->getFunction());
 
         // So if the phi uses this value multiple times, what does this look
         // like?
@@ -658,6 +651,29 @@
     return MostUsed.GV;
   }
 
+  DenseMap<Function *, Value *> tableKernelIndexCache;
+  Value *getTableLookupKernelIndex(Module &M, Function *F) {
+    // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
+    // lowers to a read from a live in register. Emit it once in the entry
+    // block to spare deduplicating it later.
+    if (tableKernelIndexCache.count(F) == 0) {
+      LLVMContext &Ctx = M.getContext();
+      IRBuilder<> Builder(Ctx);
+      FunctionType *FTy = FunctionType::get(Type::getInt32Ty(Ctx), {});
+      Function *Decl =
+          Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {});
+
+      BasicBlock::iterator it =
+          F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
+      Instruction &i = *it;
+      Builder.SetInsertPoint(&i);
+
+      tableKernelIndexCache[F] = Builder.CreateCall(FTy, Decl, {});
+    }
+
+    return tableKernelIndexCache[F];
+  }
+
   bool runOnModule(Module &M) override {
     LLVMContext &Ctx = M.getContext();
     CallGraph CG = CallGraph(M);
@@ -685,6 +701,7 @@
     DenseSet<GlobalVariable *> ModuleScopeVariables;
     DenseSet<GlobalVariable *> TableLookupVariables;
     DenseSet<GlobalVariable *> KernelAccessVariables;
+    DenseSet<GlobalVariable *> DynamicVariables;
 
     {
       GlobalVariable *HybridModuleRoot =
@@ -708,6 +725,11 @@
         assert(AMDGPU::isLDSVariableToLower(*GV));
         assert(K.second.size() != 0);
 
+        if (isDynamicLDS(GV)) {
+          DynamicVariables.insert(GV);
+          continue;
+        }
+
         switch (LoweringKindLoc) {
         case LoweringKind::module:
           ModuleScopeVariables.insert(GV);
@@ -744,9 +766,10 @@
       }
 
       assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
-                 KernelAccessVariables.size() ==
+                 KernelAccessVariables.size() + DynamicVariables.size() ==
              LDSToKernelsThatNeedToAccessItIndirectly.size());
-    } // Variables have now been partitioned into the three lowering strategies.
+    } // Variables have now been partitioned into the distinct lowering
+      // strategies.
 
     // If the kernel accesses a variable that is going to be stored in the
     // module instance through a call then that kernel needs to allocate the
@@ -757,6 +780,9 @@
     DenseSet<Function *> KernelsThatAllocateTableLDS =
         kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
                                                         TableLookupVariables);
+    DenseSet<Function *> KernelsThatAllocateDynamicLDS =
+        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
+                                                        DynamicVariables);
 
     if (!ModuleScopeVariables.empty()) {
       LDSVariableReplacement ModuleScopeReplacement =
@@ -813,7 +839,8 @@
       }
     }
 
-    // Create a struct for each kernel for the non-module-scope variables
+    // Create a struct for each kernel for the non-module-scope, non-dynamic
+    // variables
     DenseMap<Function *, LDSVariableReplacement> KernelToReplacement;
     for (Function &Func : M.functions()) {
       if (Func.isDeclaration() || !isKernelLDS(&Func))
@@ -821,10 +848,12 @@
 
       DenseSet<GlobalVariable *> KernelUsedVariables;
       for (auto &v : LDSUsesInfo.direct_access[&Func]) {
-        KernelUsedVariables.insert(v);
+        if (!isDynamicLDS(v))
+          KernelUsedVariables.insert(v);
       }
       for (auto &v : LDSUsesInfo.indirect_access[&Func]) {
-        KernelUsedVariables.insert(v);
+        if (!isDynamicLDS(v))
+          KernelUsedVariables.insert(v);
       }
 
       // Variables allocated in module lds must all resolve to that struct,
@@ -883,44 +912,56 @@
       });
     }
 
-    if (!KernelsThatAllocateTableLDS.empty()) {
-      // Collect the kernels that allocate table lookup LDS
-      std::vector<Function *> OrderedKernels;
-      {
-        for (Function &Func : M.functions()) {
-          if (Func.isDeclaration())
-            continue;
-          if (!isKernelLDS(&Func))
-            continue;
+    // Collect kernels that allocate table lookup LDS or need to allocate
+    // dynamic LDS
+    std::vector<Function *> OrderedKernels;
+    if (!KernelsThatAllocateTableLDS.empty() ||
+        !KernelsThatAllocateDynamicLDS.empty()) {
 
-          if (KernelsThatAllocateTableLDS.contains(&Func)) {
-            assert(Func.hasName()); // else fatal error earlier
-            OrderedKernels.push_back(&Func);
-          }
+      for (Function &Func : M.functions()) {
+        if (Func.isDeclaration())
+          continue;
+        if (!isKernelLDS(&Func))
+          continue;
+
+        if (KernelsThatAllocateTableLDS.contains(&Func) ||
+            KernelsThatAllocateDynamicLDS.contains(&Func)) {
+          assert(Func.hasName()); // else fatal error earlier (todo: make this
+                                  // true)
+          OrderedKernels.push_back(&Func);
         }
+      }
 
-        // Put them in an arbitrary but reproducible order
-        llvm::sort(OrderedKernels.begin(), OrderedKernels.end(),
-                   [](const Function *lhs, const Function *rhs) -> bool {
-                     return lhs->getName() < rhs->getName();
-                   });
+      // Put them in an arbitrary but reproducible order
+      llvm::sort(OrderedKernels.begin(), OrderedKernels.end(),
+                 [](const Function *lhs, const Function *rhs) -> bool {
+                   return lhs->getName() < rhs->getName();
+                 });
 
-        // Annotate the kernels with their order in this vector
+      // Annotate the kernels with their order in this vector
+      LLVMContext &Ctx = M.getContext();
+      IRBuilder<> Builder(Ctx);
+
+      if (OrderedKernels.size() > UINT32_MAX) {
+        // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
+        report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels");
+      }
+
+      for (size_t i = 0; i < OrderedKernels.size(); i++) {
+        Metadata *AttrMDArgs[1] = {
+            ConstantAsMetadata::get(Builder.getInt32(i)),
+        };
+        OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id",
+                                       MDNode::get(Ctx, AttrMDArgs));
+      }
+    }
+
+    if (!KernelsThatAllocateTableLDS.empty()) {
+      {
         LLVMContext &Ctx = M.getContext();
         IRBuilder<> Builder(Ctx);
 
-        if (OrderedKernels.size() > UINT32_MAX) {
-          // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
-          report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels");
-        }
-
         for (size_t i = 0; i < OrderedKernels.size(); i++) {
-          Metadata *AttrMDArgs[1] = {
-              ConstantAsMetadata::get(Builder.getInt32(i)),
-          };
-          OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id",
-                                         MDNode::get(Ctx, AttrMDArgs));
-
           markUsedByKernel(Builder, OrderedKernels[i],
                            KernelToReplacement[OrderedKernels[i]].SGV);
         }
@@ -936,15 +977,106 @@
                    return lhs->getName() < rhs->getName();
                  });
 
+      // element[k] of lookup table is an array of i32 of length
+      // TableLookupVariablesOrdered.size() containing the address of the ith
+      // variable at index i
+
       GlobalVariable *LookupTable = buildLookupTable(
           M, TableLookupVariablesOrdered, OrderedKernels, KernelToReplacement);
       replaceUsesInInstructionsWithTableLookup(M, TableLookupVariablesOrdered,
                                                LookupTable);
     }
 
+    if (!KernelsThatAllocateDynamicLDS.empty()) {
+      LLVMContext &Ctx = M.getContext();
+      IRBuilder<> Builder(Ctx);
+      const DataLayout &DL = M.getDataLayout();
+      Type *I32 = Type::getInt32Ty(Ctx);
+
+      std::vector<Constant *> newDynamicLDS;
+
+      for (auto &func : OrderedKernels) {
+
+        if (KernelsThatAllocateDynamicLDS.contains(func)) {
+
+          Align MaxDynamicAlignment(1);
+          fprintf(stdout, "kernel %s\n", func->getName().str().c_str());
+          for (GlobalVariable *GV : LDSUsesInfo.indirect_access[func]) {
+            if (!DynamicVariables.contains(GV))
+              continue;
+
+            fprintf(stdout, "  uses dynamic variable\n  ");
+            GV->dump();
+
+            MaxDynamicAlignment =
+                std::max(MaxDynamicAlignment, AMDGPU::getAlign(DL, GV));
+          }
+
+          auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0);
+          std::string VarName =
+              Twine("llvm.amdgcn." + func->getName() + ".dynlds").str();
+          GlobalVariable *N = new GlobalVariable(
+              M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr,
+              VarName, nullptr, GlobalValue::NotThreadLocal,
+              AMDGPUAS::LOCAL_ADDRESS, false);
+          N->setAlignment(MaxDynamicAlignment);
+
+          markUsedByKernel(Builder, func, N);
+
+          assert(isDynamicLDS(N));
+
+          auto GEP = ConstantExpr::getGetElementPtr(
+              emptyCharArray, N, ConstantInt::get(I32, 0), true);
+          newDynamicLDS.push_back(ConstantExpr::getPtrToInt(GEP, I32));
+        } else {
+          newDynamicLDS.push_back(PoisonValue::get(I32));
+        }
+      }
+
+      ArrayType *t = ArrayType::get(I32, OrderedKernels.size());
+      Constant *init = ConstantArray::get(t, newDynamicLDS);
+      GlobalVariable *table = new GlobalVariable(
+          M, t, true, GlobalValue::InternalLinkage, init,
+          "llvm.amdgcn.dynlds.offset.table", nullptr,
+          GlobalValue::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS);
+
+      for (GlobalVariable *GV : DynamicVariables) {
+        for (Use &U : make_early_inc_range(GV->uses())) {
+          auto *I = dyn_cast<Instruction>(U.getUser());
+          if (!I)
+            continue;
+          if (isKernelLDS(I->getFunction()))
+            continue;
+
+          Value *tableKernelIndex =
+              getTableLookupKernelIndex(M, I->getFunction());
+          if (auto *Phi = dyn_cast<PHINode>(I)) {
+            BasicBlock *BB = Phi->getIncomingBlock(U);
+            Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt())));
+          } else {
+            Builder.SetInsertPoint(I);
+          }
+
+          Value *GEPIdx[2] = {
+              ConstantInt::get(I32, 0),
+              tableKernelIndex,
+          };
+
+          Value *Address = Builder.CreateInBoundsGEP(
+              table->getValueType(), table, GEPIdx, GV->getName());
+
+          Value *loaded = Builder.CreateLoad(I32, Address);
+
+          Value *replacement =
+              Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName());
+
+          U.set(replacement);
+        }
+      }
+    }
+
     for (auto &GV : make_early_inc_range(M.globals()))
       if (AMDGPU::isLDSVariableToLower(GV)) {
-
         // probably want to remove from used lists
         GV.removeDeadConstantUsers();
         if (GV.use_empty())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -13,6 +13,7 @@
 //
 
 #include "AMDGPUMCInstLower.h"
+#include "AMDGPU.h"
 #include "AMDGPUAsmPrinter.h"
 #include "AMDGPUMachineFunction.h"
 #include "AMDGPUTargetMachine.h"
@@ -168,12 +169,17 @@
 const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
 
   // Intercept LDS variables with known addresses
-  if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(CV)) {
-    if (AMDGPUMachineFunction::isKnownAddressLDSGlobal(*GV)) {
-      unsigned offset =
-          AMDGPUMachineFunction::calculateKnownAddressOfLDSGlobal(*GV);
-      Constant *C = ConstantInt::get(CV->getContext(), APInt(32, offset));
-      return AsmPrinter::lowerConstant(C);
+  if (const GlobalVariable *GV = dyn_cast<const GlobalVariable>(CV)) {
+    if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+      auto MD = GV->getMetadata(LLVMContext::MD_absolute_symbol);
+      if (MD && MD->getNumOperands() == 1) {
+        if (ConstantInt *KnownSize =
+                mdconst::extract<ConstantInt>(MD->getOperand(0))) {
+          if (KnownSize->getZExtValue() <= UINT32_MAX) {
+            return AsmPrinter::lowerConstant(KnownSize);
+          }
+        }
+      }
     }
   }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -104,7 +104,7 @@
   unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV,
                              Align Trailing);
 
-  void allocateKnownAddressLDSGlobal(const Function &F);
+  void allocateKnownAddressLDSGlobal(Module *M, const Function &F);
 
   // A kernel function may have an associated LDS allocation, and a kernel-scope
   // LDS allocation must have an associated kernel function
@@ -112,18 +112,16 @@
   // LDS allocation should have an associated kernel function
   static const Function *
   getKernelLDSFunctionFromGlobal(const GlobalVariable &GV);
-  static const GlobalVariable *
-  getKernelLDSGlobalFromFunction(const Function &F);
-
-  // Module or kernel scope LDS variable
-  static bool isKnownAddressLDSGlobal(const GlobalVariable &GV);
-  static unsigned calculateKnownAddressOfLDSGlobal(const GlobalVariable &GV);
+  static GlobalVariable *getKernelLDSGlobalFromFunction(Module *M,
+                                                        const Function &F);
+  static GlobalVariable *getKernelDynLDSGlobalFromFunction(Module *M,
+                                                           const Function &F);
 
   static std::optional<uint32_t> getLDSKernelIdMetadata(const Function &F);
 
   Align getDynLDSAlign() const { return DynLDSAlign; }
 
-  void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV);
+  void setDynLDSAlign(Module *M, const Function &F, const GlobalVariable &GV);
 };
 
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -12,6 +12,8 @@
 #include "AMDGPUSubtarget.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/Target/TargetMachine.h"
 
 using namespace llvm;
@@ -89,12 +91,6 @@
 
 static constexpr StringLiteral ModuleLDSName = "llvm.amdgcn.module.lds";
 
-bool AMDGPUMachineFunction::isKnownAddressLDSGlobal(const GlobalVariable &GV) {
-  auto name = GV.getName();
-  return (name == ModuleLDSName) ||
-         (name.startswith("llvm.amdgcn.kernel.") && name.endswith(".lds"));
-}
-
 const Function *AMDGPUMachineFunction::getKernelLDSFunctionFromGlobal(
     const GlobalVariable &GV) {
   const Module &M = *GV.getParent();
@@ -105,53 +101,42 @@
   return nullptr;
 }
 
-const GlobalVariable *
-AMDGPUMachineFunction::getKernelLDSGlobalFromFunction(const Function &F) {
-  const Module *M = F.getParent();
+GlobalVariable *
+AMDGPUMachineFunction::getKernelLDSGlobalFromFunction(Module *M,
+                                                      const Function &F) {
   std::string KernelLDSName = "llvm.amdgcn.kernel.";
   KernelLDSName += F.getName();
   KernelLDSName += ".lds";
   return M->getNamedGlobal(KernelLDSName);
 }
 
+GlobalVariable *
+AMDGPUMachineFunction::getKernelDynLDSGlobalFromFunction(Module *M,
+                                                         const Function &F) {
+  std::string KernelDynLDSName = "llvm.amdgcn.";
+  KernelDynLDSName += F.getName();
+  KernelDynLDSName += ".dynlds";
+  return M->getNamedGlobal(KernelDynLDSName);
+}
+
 // This kernel calls no functions that require the module lds struct
 static bool canElideModuleLDS(const Function &F) {
   return F.hasFnAttribute("amdgpu-elide-module-lds");
 }
 
-unsigned AMDGPUMachineFunction::calculateKnownAddressOfLDSGlobal(
-    const GlobalVariable &GV) {
-  // module.lds, then alignment padding, then kernel.lds, then other variables
-  // if any
-
-  assert(isKnownAddressLDSGlobal(GV));
-  unsigned Offset = 0;
-
-  if (GV.getName() == ModuleLDSName) {
-    return 0;
-  }
-
-  const Module *M = GV.getParent();
-  const DataLayout &DL = M->getDataLayout();
-
-  const GlobalVariable *GVM = M->getNamedGlobal(ModuleLDSName);
-  const Function *f = getKernelLDSFunctionFromGlobal(GV);
-
-  // Account for module.lds if allocated for this function
-  if (GVM && f && !canElideModuleLDS(*f)) {
-    // allocator aligns this to var align, but it's zero to begin with
-    Offset += DL.getTypeAllocSize(GVM->getValueType());
-  }
-
-  // No dynamic LDS alignment done by allocateModuleLDSGlobal
-  Offset = alignTo(
-      Offset, DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType()));
-
-  return Offset;
+static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
+                                     uint32_t Address) {
+  // Write the specified address into metadata where it can be retrieved by the
+  // assembler
+  LLVMContext &Ctx = M->getContext();
+  auto Type = M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
+  GV->setMetadata(LLVMContext::MD_absolute_symbol,
+                  MDNode::get(Ctx, ConstantAsMetadata::get(
+                                       ConstantInt::get(Type, Address))));
 }
 
-void AMDGPUMachineFunction::allocateKnownAddressLDSGlobal(const Function &F) {
-  const Module *M = F.getParent();
+void AMDGPUMachineFunction::allocateKnownAddressLDSGlobal(Module *M,
+                                                          const Function &F) {
 
   // This function is called before allocating any other LDS so that it can
   // reliably put values at known addresses. Consequently, dynamic LDS, if
@@ -176,25 +161,24 @@
     // }
     // other variables, e.g. dynamic lds, allocated after this call
 
-    const GlobalVariable *GV = M->getNamedGlobal(ModuleLDSName);
-    const GlobalVariable *KV = getKernelLDSGlobalFromFunction(F);
+    GlobalVariable *GV = M->getNamedGlobal(ModuleLDSName);
+    GlobalVariable *KV = getKernelLDSGlobalFromFunction(M, F);
+    GlobalVariable *Dyn = getKernelDynLDSGlobalFromFunction(M, F);
 
     if (GV && !canElideModuleLDS(F)) {
-      assert(isKnownAddressLDSGlobal(*GV));
       unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV, Align());
-      (void)Offset;
-      assert(Offset == calculateKnownAddressOfLDSGlobal(*GV) &&
-             "Module LDS expected to be allocated before other LDS");
+      recordLDSAbsoluteAddress(M, GV, Offset);
     }
 
     if (KV) {
       // The per-kernel offset is deterministic because it is allocated
       // before any other non-module LDS variables.
-      assert(isKnownAddressLDSGlobal(*KV));
       unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *KV, Align());
-      (void)Offset;
-      assert(Offset == calculateKnownAddressOfLDSGlobal(*KV) &&
-             "Kernel LDS expected to be immediately after module LDS");
+      recordLDSAbsoluteAddress(M, KV, Offset);
+    }
+
+    if (Dyn) {
+      setDynLDSAlign(M, F, *Dyn);
     }
   }
 }
@@ -214,8 +198,9 @@
   return {};
 }
 
-void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL,
+void AMDGPUMachineFunction::setDynLDSAlign(Module *M, const Function &F,
                                            const GlobalVariable &GV) {
+  const DataLayout &DL = M->getDataLayout();
   assert(DL.getTypeAllocSize(GV.getValueType()).isZero());
 
   Align Alignment =
@@ -225,4 +210,13 @@
 
   LDSSize = alignTo(StaticLDSSize, Alignment);
   DynLDSAlign = Alignment;
+
+  // If there is a dynamic LDS variable associated with this function, update it
+  // whenever changing the address of dynamic LDS, aka when calling this
+  // function
+  GlobalVariable *Dyn =
+      AMDGPUMachineFunction::getKernelDynLDSGlobalFromFunction(M, F);
+  if (Dyn) {
+    recordLDSAbsoluteAddress(M, Dyn, LDSSize);
+  }
 }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2412,7 +2412,7 @@
     return DAG.getEntryNode();
   }
 
-  Info->allocateKnownAddressLDSGlobal(Fn);
+  Info->allocateKnownAddressLDSGlobal(MF.getFunction().getParent(), Fn);
 
   SmallVector<ISD::InputArg, 16> Splits;
   SmallVector<CCValAssign, 16> ArgLocs;
@@ -6139,7 +6139,8 @@
       if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
         assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
         // Adjust alignment for that dynamic shared memory array.
-        MFI->setDynLDSAlign(DAG.getDataLayout(), *cast<GlobalVariable>(GV));
+        Function &F = DAG.getMachineFunction().getFunction();
+        MFI->setDynLDSAlign(F.getParent(), F, *cast<GlobalVariable>(GV));
         return SDValue(
             DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
       }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -79,24 +79,23 @@
   if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
     return false;
   }
+  if (GV.isConstant()) {
+    // A constant undef variable can't be written to, and any load is
+    // undef, so it should be eliminated by the optimizer. It could be
+    // dropped by the back end if not. This pass skips over it.
+    return false;
+  }
   if (!GV.hasInitializer()) {
     // addrspace(3) without initializer implies cuda/hip extern __shared__
     // the semantics for such a variable appears to be that all extern
-    // __shared__ variables alias one another, in which case this transform
-    // is not required
-    return false;
+    // __shared__ variables alias one another. This hits different handling.
+    return true;
   }
   if (!isa<UndefValue>(GV.getInitializer())) {
     // Initializers are unimplemented for LDS address space.
     // Leave such variables in place for consistent error reporting.
     return false;
   }
-  if (GV.isConstant()) {
-    // A constant undef variable can't be written to, and any load is
-    // undef, so it should be eliminated by the optimizer. It could be
-    // dropped by the back end if not. This pass skips over it.
-    return false;
-  }
   return true;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
@@ -18,10 +18,6 @@
 @const_undef = addrspace(3) constant i32 undef
 @const_with_init = addrspace(3) constant i64 8
 
-; External and constant are both left to the optimizer / error diagnostics
-; CHECK: @extern = external addrspace(3) global i32
-@extern = external addrspace(3) global i32
-
 ; Use of an addrspace(3) variable with an initializer is skipped,
 ; so as to preserve the unimplemented error from llc
 ; CHECK: @with_init = addrspace(3) global i64 0
@@ -47,14 +43,12 @@
 ; CHECK: %c0 = load i32, ptr addrspace(3) @const_undef, align 4
 ; CHECK: %c1 = load i64, ptr addrspace(3) @const_with_init, align 4
 ; CHECK: %v0 = atomicrmw add ptr addrspace(3) @with_init, i64 1 seq_cst
-; CHECK: %v1 = cmpxchg ptr addrspace(3) @extern, i32 4, i32 %c0 acq_rel monotonic
-; CHECK: %v2 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic
+; CHECK: %v1 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic
 define void @use_variables() {
   %c0 = load i32, ptr addrspace(3) @const_undef, align 4
   %c1 = load i64, ptr addrspace(3) @const_with_init, align 4
   %v0 = atomicrmw add ptr addrspace(3) @with_init, i64 1 seq_cst
-  %v1 = cmpxchg ptr addrspace(3) @extern, i32 4, i32 %c0 acq_rel monotonic
-  %v2 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic
+  %v1 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic
   ret void
 }