diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2542,7 +2542,7 @@
     // allocated ones. They all share the same offset.
     if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
       // Adjust alignment for that dynamic shared memory array.
-      MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
+      MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
       LLT S32 = LLT::scalar(32);
       auto Sz =
           B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -184,6 +184,18 @@
   return AMDGPU::isKernel(F->getCallingConv());
 }
 
+bool isDynamicLDS(const GlobalVariable *GV) {
+  if (AMDGPU::isLDSVariableToLower(*GV)) {
+    if (!GV->hasInitializer()) {
+      // addrspace(3) without initializer implies cuda/hip extern __shared__.
+      // The semantics for such a variable appear to be that all extern
+      // __shared__ variables alias one another.
+      return true;
+    }
+  }
+  return false;
+}
+
 class AMDGPULowerModuleLDS : public ModulePass {
 
   static void
@@ -468,6 +480,44 @@
                                     AMDGPUAS::CONSTANT_ADDRESS);
   }
 
+  void replaceUseWithTableLookup(Module &M, IRBuilder<> &Builder,
+                                 GlobalVariable *LookupTable,
+                                 GlobalVariable *GV, Use &U,
+                                 Value *OptionalIndex) {
+    // Table is a constant array of the same length as OrderedKernels.
+    // It either contains 32 bit pointers or structs of 32 bit pointers, where
+    // the OptionalIndex argument is used to index into the struct if present.
+    LLVMContext &Ctx = M.getContext();
+    Type *I32 = Type::getInt32Ty(Ctx);
+    auto *I = cast<Instruction>(U.getUser());
+
+    Value *tableKernelIndex = getTableLookupKernelIndex(M, I->getFunction());
+
+    if (auto *Phi = dyn_cast<PHINode>(I)) {
+      BasicBlock *BB = Phi->getIncomingBlock(U);
+      Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt())));
+    } else {
+      Builder.SetInsertPoint(I);
+    }
+
+    SmallVector<Value *, 3> GEPIdx = {
+        ConstantInt::get(I32, 0),
+        tableKernelIndex,
+    };
+    if (OptionalIndex)
+      GEPIdx.push_back(OptionalIndex);
+
+    Value *Address = Builder.CreateInBoundsGEP(
+        LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName());
+
+    Value *loaded = Builder.CreateLoad(I32, Address);
+
+    Value *replacement =
+        Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName());
+
+    U.set(replacement);
+  }
+
   void replaceUsesInInstructionsWithTableLookup(
       Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables,
       GlobalVariable *LookupTable) {
@@ -484,33 +534,8 @@
         if (!I)
          continue;
 
-        Value *tableKernelIndex =
-            getTableLookupKernelIndex(M, I->getFunction());
-
-        // So if the phi uses this value multiple times, what does this look
-        // like?
-        if (auto *Phi = dyn_cast<PHINode>(I)) {
-          BasicBlock *BB = Phi->getIncomingBlock(U);
-          Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt())));
-        } else {
-          Builder.SetInsertPoint(I);
-        }
-
-        Value *GEPIdx[3] = {
-            ConstantInt::get(I32, 0),
-            tableKernelIndex,
-            ConstantInt::get(I32, Index),
-        };
-
-        Value *Address = Builder.CreateInBoundsGEP(
-            LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName());
-
-        Value *loaded = Builder.CreateLoad(I32, Address);
-
-        Value *replacement =
-            Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName());
-
-        U.set(replacement);
+        replaceUseWithTableLookup(M, Builder, LookupTable, GV, U,
+                                  ConstantInt::get(I32, Index));
       }
     }
   }
@@ -632,47 +657,52 @@
   }
 
   static std::vector<Function *> assignLDSKernelIDToEachKernel(
-      Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS) {
+      Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS,
+      DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS) {
     // Associate kernels in the set with an arbirary but reproducible order and
     // annotate them with that order in metadata. This metadata is recognised by
     // the backend and lowered to a SGPR which can be read from using
     // amdgcn_lds_kernel_id.
 
     std::vector<Function *> OrderedKernels;
+    if (!KernelsThatAllocateTableLDS.empty() ||
+        !KernelsThatIndirectlyAllocateDynamicLDS.empty()) {
 
-    for (Function &Func : M->functions()) {
-      if (Func.isDeclaration())
-        continue;
-      if (!isKernelLDS(&Func))
-        continue;
+      for (Function &Func : M->functions()) {
+        if (Func.isDeclaration())
+          continue;
+        if (!isKernelLDS(&Func))
+          continue;
 
-      if (KernelsThatAllocateTableLDS.contains(&Func)) {
-        assert(Func.hasName()); // else fatal error earlier
-        OrderedKernels.push_back(&Func);
+        if (KernelsThatAllocateTableLDS.contains(&Func) ||
+            KernelsThatIndirectlyAllocateDynamicLDS.contains(&Func)) {
+          assert(Func.hasName()); // else fatal error earlier
+          OrderedKernels.push_back(&Func);
+        }
       }
-    }
 
-    // Put them in an arbitrary but reproducible order
-    llvm::sort(OrderedKernels.begin(), OrderedKernels.end(),
-               [](const Function *lhs, const Function *rhs) -> bool {
-                 return lhs->getName() < rhs->getName();
-               });
+      // Put them in an arbitrary but reproducible order
+      llvm::sort(OrderedKernels.begin(), OrderedKernels.end(),
+                 [](const Function *lhs, const Function *rhs) -> bool {
+                   return lhs->getName() < rhs->getName();
+                 });
 
-    // Annotate the kernels with their order in this vector
-    LLVMContext &Ctx = M->getContext();
-    IRBuilder<> Builder(Ctx);
+      // Annotate the kernels with their order in this vector
+      LLVMContext &Ctx = M->getContext();
+      IRBuilder<> Builder(Ctx);
 
-    if (OrderedKernels.size() > UINT32_MAX) {
-      // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
-      report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels");
-    }
+      if (OrderedKernels.size() > UINT32_MAX) {
+        // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
+        report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels");
+      }
 
-    for (size_t i = 0; i < OrderedKernels.size(); i++) {
-      Metadata *AttrMDArgs[1] = {
-          ConstantAsMetadata::get(Builder.getInt32(i)),
-      };
-      OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id",
-                                     MDNode::get(Ctx, AttrMDArgs));
+      for (size_t i = 0; i < OrderedKernels.size(); i++) {
+        Metadata *AttrMDArgs[1] = {
+            ConstantAsMetadata::get(Builder.getInt32(i)),
+        };
+        OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id",
+                                       MDNode::get(Ctx, AttrMDArgs));
+      }
     }
 
     return OrderedKernels;
@@ -683,7 +713,8 @@
       VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly,
       DenseSet<GlobalVariable *> &ModuleScopeVariables,
       DenseSet<GlobalVariable *> &TableLookupVariables,
-      DenseSet<GlobalVariable *> &KernelAccessVariables) {
+      DenseSet<GlobalVariable *> &KernelAccessVariables,
+      DenseSet<GlobalVariable *> &DynamicVariables) {
 
     GlobalVariable *HybridModuleRoot =
         LoweringKindLoc != LoweringKind::hybrid
@@ -705,6 +736,11 @@
       assert(AMDGPU::isLDSVariableToLower(*GV));
       assert(K.second.size() != 0);
 
+      if (isDynamicLDS(GV)) {
+        DynamicVariables.insert(GV);
+        continue;
+      }
+
       switch (LoweringKindLoc) {
       case LoweringKind::module:
         ModuleScopeVariables.insert(GV);
@@ -743,7 +779,7 @@
     // All LDS variables accessed indirectly have now been partitioned into
     // the distinct lowering strategies.
     assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
-               KernelAccessVariables.size() ==
+               KernelAccessVariables.size() + DynamicVariables.size() ==
            LDSToKernelsThatNeedToAccessItIndirectly.size());
   }
 
@@ -813,7 +849,7 @@
         markUsedByKernel(Builder, &Func, ModuleScopeReplacement.SGV);
 
       } else {
-        Func.addFnAttr("amdgpu-elide-module-lds");
+        markElideModuleLDS(Func);
       }
     }
 
@@ -838,12 +874,15 @@
       // Allocating variables that are used directly in this struct to get
       // alignment aware allocation and predictable frame size.
       for (auto &v : LDSUsesInfo.direct_access[&Func]) {
-        KernelUsedVariables.insert(v);
+        if (!isDynamicLDS(v)) {
+          KernelUsedVariables.insert(v);
+        }
       }
 
       // Allocating variables that are accessed indirectly so that a lookup of
       // this struct instance can find them from nested functions.
       for (auto &v : LDSUsesInfo.indirect_access[&Func]) {
+        assert(!isDynamicLDS(v));
        KernelUsedVariables.insert(v);
       }
 
@@ -857,6 +896,7 @@
 
       if (KernelUsedVariables.empty()) {
         // Either used no LDS, or the LDS it used was all in the module struct
+        // or dynamically sized
         continue;
       }
 
@@ -876,33 +916,6 @@
       auto Replacement =
           createLDSVariableReplacement(M, VarName, KernelUsedVariables);
 
-      // This struct is allocated at a predictable address that can be
-      // calculated now, recorded in metadata then used to lower references to
-      // it during codegen.
-      {
-        // frame layout, starting from 0
-        //{
-        //   module.lds
-        //   alignment padding
-        //   kernel instance
-        //}
-
-        if (!MaybeModuleScopeStruct ||
-            Func.hasFnAttribute("amdgpu-elide-module-lds")) {
-          // There's no module.lds for this kernel so this replacement struct
-          // goes first
-          recordLDSAbsoluteAddress(&M, Replacement.SGV, 0);
-        } else {
-          const DataLayout &DL = M.getDataLayout();
-          TypeSize ModuleSize =
-              DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType());
-          GlobalVariable *KernelStruct = Replacement.SGV;
-          Align KernelAlign = AMDGPU::getAlign(DL, KernelStruct);
-          recordLDSAbsoluteAddress(&M, Replacement.SGV,
-                                   alignTo(ModuleSize, KernelAlign));
-        }
-      }
-
       // remove preserves existing codegen
       removeLocalVarsFromUsedLists(M, KernelUsedVariables);
       KernelToReplacement[&Func] = Replacement;
@@ -917,6 +930,124 @@
     return KernelToReplacement;
   }
 
+  static GlobalVariable *
+  buildRepresentativeDynamicLDSInstance(Module &M, LDSUsesInfoTy &LDSUsesInfo,
+                                        Function *func) {
+    // Create a dynamic lds variable with a name associated with the passed
+    // function that has the maximum alignment of any dynamic lds variable
+    // reachable from this kernel. Dynamic LDS is allocated after the static LDS
+    // allocation, possibly after alignment padding. The representative variable
+    // created here has the maximum alignment of any other dynamic variable
+    // reachable by that kernel. All dynamic LDS variables are allocated at the
+    // same address in each kernel in order to provide the documented aliasing
+    // semantics. Setting the alignment here allows this IR pass to accurately
+    // predict the exact constant at which it will be allocated.
+
+    assert(isKernelLDS(func));
+
+    LLVMContext &Ctx = M.getContext();
+    const DataLayout &DL = M.getDataLayout();
+    Align MaxDynamicAlignment(1);
+
+    auto UpdateMaxAlignment = [&MaxDynamicAlignment, &DL](GlobalVariable *GV) {
+      if (isDynamicLDS(GV)) {
+        MaxDynamicAlignment =
+            std::max(MaxDynamicAlignment, AMDGPU::getAlign(DL, GV));
+      }
+    };
+
+    for (GlobalVariable *GV : LDSUsesInfo.indirect_access[func]) {
+      UpdateMaxAlignment(GV);
+    }
+
+    for (GlobalVariable *GV : LDSUsesInfo.direct_access[func]) {
+      UpdateMaxAlignment(GV);
+    }
+
+    assert(func->hasName()); // Checked by caller
+    std::string VarName =
+        Twine("llvm.amdgcn." + func->getName() + ".dynlds").str();
+
+    auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0);
+    GlobalVariable *N = new GlobalVariable(
+        M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr,
+        VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
+        false);
+    N->setAlignment(MaxDynamicAlignment);
+
+    assert(isDynamicLDS(N));
+    return N;
+  }
+
+  DenseMap<Function *, GlobalVariable *> lowerDynamicLDSVariables(
+      Module &M, LDSUsesInfoTy &LDSUsesInfo,
+      DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS,
+      DenseSet<GlobalVariable *> const &DynamicVariables,
+      std::vector<Function *> const &OrderedKernels) {
+    DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS;
+    if (!KernelsThatIndirectlyAllocateDynamicLDS.empty()) {
+      LLVMContext &Ctx = M.getContext();
+      IRBuilder<> Builder(Ctx);
+      Type *I32 = Type::getInt32Ty(Ctx);
+
+      std::vector<Constant *> newDynamicLDS;
+
+      // Table is built in the same order as OrderedKernels
+      for (auto &func : OrderedKernels) {
+
+        if (KernelsThatIndirectlyAllocateDynamicLDS.contains(func)) {
+          assert(isKernelLDS(func));
+          if (!func->hasName()) {
+            report_fatal_error("Anonymous kernels cannot use LDS variables");
+          }
+
+          GlobalVariable *N =
+              buildRepresentativeDynamicLDSInstance(M, LDSUsesInfo, func);
+
+          KernelToCreatedDynamicLDS[func] = N;
+
+          markUsedByKernel(Builder, func, N);
+
+          auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0);
+          auto GEP = ConstantExpr::getGetElementPtr(
+              emptyCharArray, N, ConstantInt::get(I32, 0), true);
+          newDynamicLDS.push_back(ConstantExpr::getPtrToInt(GEP, I32));
+        } else {
+          newDynamicLDS.push_back(PoisonValue::get(I32));
+        }
+      }
+      assert(OrderedKernels.size() == newDynamicLDS.size());
+
+      ArrayType *t = ArrayType::get(I32, newDynamicLDS.size());
+      Constant *init = ConstantArray::get(t, newDynamicLDS);
+      GlobalVariable *table = new GlobalVariable(
+          M, t, true, GlobalValue::InternalLinkage, init,
+          "llvm.amdgcn.dynlds.offset.table", nullptr,
+          GlobalValue::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS);
+
+      for (GlobalVariable *GV : DynamicVariables) {
+        for (Use &U : make_early_inc_range(GV->uses())) {
+          auto *I = dyn_cast<Instruction>(U.getUser());
+          if (!I)
+            continue;
+          if (isKernelLDS(I->getFunction()))
+            continue;
+
+          replaceUseWithTableLookup(M, Builder, table, GV, U, nullptr);
+        }
+      }
+    }
+    return KernelToCreatedDynamicLDS;
+  }
+
+  static bool canElideModuleLDS(const Function &F) {
+    return F.hasFnAttribute("amdgpu-elide-module-lds");
+  }
+
+  static void markElideModuleLDS(Function &F) {
+    F.addFnAttr("amdgpu-elide-module-lds");
+  }
+
   bool runOnModule(Module &M) override {
     CallGraph CG = CallGraph(M);
     bool Changed = superAlignLDSGlobals(M);
@@ -939,12 +1070,15 @@
       }
     }
 
+    // Partition variables accessed indirectly into the different strategies
     DenseSet<GlobalVariable *> ModuleScopeVariables;
     DenseSet<GlobalVariable *> TableLookupVariables;
     DenseSet<GlobalVariable *> KernelAccessVariables;
+    DenseSet<GlobalVariable *> DynamicVariables;
     partitionVariablesIntoIndirectStrategies(
         M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly,
-        ModuleScopeVariables, TableLookupVariables, KernelAccessVariables);
+        ModuleScopeVariables, TableLookupVariables, KernelAccessVariables,
+        DynamicVariables);
 
     // If the kernel accesses a variable that is going to be stored in the
     // module instance through a call then that kernel needs to allocate the
@@ -956,6 +1090,10 @@
         kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
                                                         TableLookupVariables);
 
+    DenseSet<Function *> KernelsThatIndirectlyAllocateDynamicLDS =
+        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
+                                                        DynamicVariables);
+
     GlobalVariable *MaybeModuleScopeStruct =
        lowerModuleScopeStructVariables(
            M, ModuleScopeVariables, KernelsThatAllocateModuleLDS);
 
@@ -981,14 +1119,15 @@
       });
     }
 
+    // The ith element of this vector is kernel id i
+    std::vector<Function *> OrderedKernels =
+        assignLDSKernelIDToEachKernel(&M, KernelsThatAllocateTableLDS,
+                                      KernelsThatIndirectlyAllocateDynamicLDS);
+
     if (!KernelsThatAllocateTableLDS.empty()) {
       LLVMContext &Ctx = M.getContext();
       IRBuilder<> Builder(Ctx);
 
-      // The ith element of this vector is kernel id i
-      std::vector<Function *> OrderedKernels =
-          assignLDSKernelIDToEachKernel(&M, KernelsThatAllocateTableLDS);
-
       for (size_t i = 0; i < OrderedKernels.size(); i++) {
         markUsedByKernel(Builder, OrderedKernels[i],
                          KernelToReplacement[OrderedKernels[i]].SGV);
@@ -1010,6 +1149,67 @@
                                              LookupTable);
     }
 
+    DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS =
+        lowerDynamicLDSVariables(M, LDSUsesInfo,
+                                 KernelsThatIndirectlyAllocateDynamicLDS,
+                                 DynamicVariables, OrderedKernels);
+
+    // All kernel frames have been allocated. Calculate and record the
+    // addresses.
+
+    {
+      const DataLayout &DL = M.getDataLayout();
+
+      for (Function &Func : M.functions()) {
+        if (Func.isDeclaration() || !isKernelLDS(&Func))
+          continue;
+
+        // All three of these are optional. The first variable is allocated at
+        // zero. They are allocated by allocateKnownAddressLDSGlobal in the
+        // following order:
+        //{
+        //  module.lds
+        //  alignment padding
+        //  kernel instance
+        //  alignment padding
+        //  dynamic lds variables
+        //}
+
+        const bool AllocateModuleScopeStruct =
+            MaybeModuleScopeStruct && !canElideModuleLDS(Func);
+
+        const bool AllocateKernelScopeStruct =
+            KernelToReplacement.contains(&Func);
+
+        const bool AllocateDynamicVariable =
+            KernelToCreatedDynamicLDS.contains(&Func);
+
+        uint32_t Offset = 0;
+
+        if (AllocateModuleScopeStruct) {
+          // Allocated at zero, recorded once on construction, not once per
+          // kernel
+          Offset += DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType());
+        }
+
+        if (AllocateKernelScopeStruct) {
+          GlobalVariable *KernelStruct = KernelToReplacement[&Func].SGV;
+
+          Offset = alignTo(Offset, AMDGPU::getAlign(DL, KernelStruct));
+
+          recordLDSAbsoluteAddress(&M, KernelStruct, Offset);
+        }
+
+        if (AllocateDynamicVariable) {
+          GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func];
+
+          Offset = alignTo(Offset, AMDGPU::getAlign(DL, DynamicVariable));
+
+          recordLDSAbsoluteAddress(&M, DynamicVariable, Offset);
+        }
+      }
+    }
+
     for (auto &GV : make_early_inc_range(M.globals()))
       if (AMDGPU::isLDSVariableToLower(GV)) {
         // probably want to remove from used lists
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -111,7 +111,7 @@
 
   Align getDynLDSAlign() const { return DynLDSAlign; }
 
-  void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV);
+  void setDynLDSAlign(const Function &F, const GlobalVariable &GV);
 };
 
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -99,6 +99,15 @@
   return M->getNamedGlobal(KernelLDSName);
 }
 
+static const GlobalVariable *
+getKernelDynLDSGlobalFromFunction(const Function &F) {
+  const Module *M = F.getParent();
+  std::string KernelDynLDSName = "llvm.amdgcn.";
+  KernelDynLDSName += F.getName();
+  KernelDynLDSName += ".dynlds";
+  return M->getNamedGlobal(KernelDynLDSName);
+}
+
 // This kernel calls no functions that require the module lds struct
 static bool canElideModuleLDS(const Function &F) {
   return F.hasFnAttribute("amdgpu-elide-module-lds");
 }
@@ -131,6 +140,7 @@
 
   const GlobalVariable *GV = M->getNamedGlobal(ModuleLDSName);
   const GlobalVariable *KV = getKernelLDSGlobalFromFunction(F);
+  const GlobalVariable *Dyn = getKernelDynLDSGlobalFromFunction(F);
 
   if (GV && !canElideModuleLDS(F)) {
     unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV, Align());
@@ -149,6 +159,19 @@
         report_fatal_error("Inconsistent metadata on kernel LDS variable");
       }
     }
+
+    if (Dyn) {
+      // The dynamic LDS is deterministic because the per-kernel one has the
+      // maximum alignment of any reachable dynamic LDS variable, and all
+      // remaining LDS variables, if this is present, are themselves dynamic
+      // LDS and will be allocated at the same address.
+      setDynLDSAlign(F, *Dyn);
+      unsigned Offset = LDSSize;
+      std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*Dyn);
+      if (!Expect || (Offset != Expect)) {
+        report_fatal_error("Inconsistent metadata on dynamic LDS variable");
+      }
+    }
   }
 }
 
@@ -187,8 +210,10 @@
   return {};
 }
 
-void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL,
+void AMDGPUMachineFunction::setDynLDSAlign(const Function &F,
                                            const GlobalVariable &GV) {
+  const Module *M = F.getParent();
+  const DataLayout &DL = M->getDataLayout();
   assert(DL.getTypeAllocSize(GV.getValueType()).isZero());
 
   Align Alignment =
@@ -198,4 +223,17 @@
 
   LDSSize = alignTo(StaticLDSSize, Alignment);
   DynLDSAlign = Alignment;
+
+  // If there is a dynamic LDS variable associated with this function F, every
+  // further dynamic LDS instance (allocated by calling setDynLDSAlign) must
+  // map to the same address. This holds because no LDS is allocated after the
+  // lowering pass if there are dynamic LDS variables present.
+  const GlobalVariable *Dyn = getKernelDynLDSGlobalFromFunction(F);
+  if (Dyn) {
+    unsigned Offset = LDSSize; // return this?
+    std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*Dyn);
+    if (!Expect || (Offset != Expect)) {
+      report_fatal_error("Inconsistent metadata on dynamic LDS variable");
+    }
+  }
 }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6139,7 +6139,8 @@
     if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
       assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
       // Adjust alignment for that dynamic shared memory array.
-      MFI->setDynLDSAlign(DAG.getDataLayout(), *cast<GlobalVariable>(GV));
+      Function &F = DAG.getMachineFunction().getFunction();
+      MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
       return SDValue(
           DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
     }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -79,24 +79,23 @@
   if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
     return false;
   }
+  if (GV.isConstant()) {
+    // A constant undef variable can't be written to, and any load is
+    // undef, so it should be eliminated by the optimizer. It could be
+    // dropped by the back end if not. This pass skips over it.
+    return false;
+  }
   if (!GV.hasInitializer()) {
     // addrspace(3) without initializer implies cuda/hip extern __shared__
     // the semantics for such a variable appears to be that all extern
-    // __shared__ variables alias one another, in which case this transform
-    // is not required
-    return false;
+    // __shared__ variables alias one another. These are lowered as dynamic LDS.
+    return true;
   }
   if (!isa<UndefValue>(GV.getInitializer())) {
     // Initializers are unimplemented for LDS address space.
     // Leave such variables in place for consistent error reporting.
     return false;
   }
-  if (GV.isConstant()) {
-    // A constant undef variable can't be written to, and any load is
-    // undef, so it should be eliminated by the optimizer. It could be
-    // dropped by the back end if not. This pass skips over it.
-    return false;
-  }
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
@@ -18,10 +18,6 @@
 @const_undef = addrspace(3) constant i32 undef
 @const_with_init = addrspace(3) constant i64 8
 
-; External and constant are both left to the optimizer / error diagnostics
-; CHECK: @extern = external addrspace(3) global i32
-@extern = external addrspace(3) global i32
-
 ; Use of an addrspace(3) variable with an initializer is skipped,
 ; so as to preserve the unimplemented error from llc
 ; CHECK: @with_init = addrspace(3) global i64 0
@@ -47,14 +43,12 @@
 ; CHECK: %c0 = load i32, ptr addrspace(3) @const_undef, align 4
 ; CHECK: %c1 = load i64, ptr addrspace(3) @const_with_init, align 4
 ; CHECK: %v0 = atomicrmw add ptr addrspace(3) @with_init, i64 1 seq_cst
-; CHECK: %v1 = cmpxchg ptr addrspace(3) @extern, i32 4, i32 %c0 acq_rel monotonic
-; CHECK: %v2 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic
+; CHECK: %v1 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic
 define void @use_variables() {
   %c0 = load i32, ptr addrspace(3) @const_undef, align 4
   %c1 = load i64, ptr addrspace(3) @const_with_init, align 4
   %v0 = atomicrmw add ptr addrspace(3) @with_init, i64 1 seq_cst
-  %v1 = cmpxchg ptr addrspace(3) @extern, i32 4, i32 %c0 acq_rel monotonic
-  %v2 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic
+  %v1 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic
   ret void
 }
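
Illustrative sketch, not part of the patch: roughly the IR shape this lowering is expected to produce when a non-kernel function uses a dynamic (extern __shared__ style) LDS variable. The kernel name k0, the function @f and the metadata ids here are hypothetical; only the llvm.amdgcn.<kernel>.dynlds / llvm.amdgcn.dynlds.offset.table naming, the !absolute_symbol annotation and the table-lookup pattern come from the code above.

; One external, zero-sized representative per kernel, pinned to a known LDS
; address via !absolute_symbol, plus a constant table of per-kernel offsets.
@llvm.amdgcn.k0.dynlds = external addrspace(3) global [0 x i8], align 4, !absolute_symbol !0
@llvm.amdgcn.dynlds.offset.table = internal addrspace(4) constant [1 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.k0.dynlds to i32)]

define void @f() {
  ; Non-kernel access: index the table by the calling kernel's id, load the
  ; 32-bit offset, then inttoptr it back to an LDS pointer.
  %id = call i32 @llvm.amdgcn.lds.kernel.id()
  %gep = getelementptr inbounds [1 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 %id
  %offset = load i32, ptr addrspace(4) %gep
  %dyn = inttoptr i32 %offset to ptr addrspace(3)
  store i8 0, ptr addrspace(3) %dyn
  ret void
}

declare i32 @llvm.amdgcn.lds.kernel.id()

!0 = !{i32 0, i32 1}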