diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -127,7 +127,7 @@ unsigned AS = GV.getAddressSpace(); if ((AS == AMDGPUAS::REGION_ADDRESS) || (AS == AMDGPUAS::LOCAL_ADDRESS && - (!AMDGPUTargetMachine::EnableLowerModuleLDS || !GV.hasInitializer()))) + (!AMDGPUTargetMachine::EnableLowerModuleLDS))) recursivelyVisitUsers(GV, FuncsToAlwaysInline); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2542,7 +2542,7 @@ // allocated ones. They all share the same offset. if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { // Adjust alignment for that dynamic shared memory array. - MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); + MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV)); LLT S32 = LLT::scalar(32); auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -184,6 +184,18 @@ return AMDGPU::isKernel(F->getCallingConv()); } +bool isDynamicLDS(const GlobalVariable *GV) { + if (AMDGPU::isLDSVariableToLower(*GV)) { + if (!GV->hasInitializer()) { + // addrspace(3) without initializer implies cuda/hip extern __shared__ + // the semantics for such a variable appears to be that all extern + // __shared__ variables alias one another + return true; + } + } + return false; +} + class AMDGPULowerModuleLDS : public ModulePass { static void @@ -468,6 +480,44 @@ AMDGPUAS::CONSTANT_ADDRESS); } + void replaceUseWithTableLookup(Module &M, IRBuilder<> &Builder, + GlobalVariable *LookupTable, + GlobalVariable *GV, Use &U, + Value *OptionalIndex) { + // Table is a constant array of the same length as OrderedKernels + // It either contains 32 bit pointers or structs of 32 bit pointers, where + // the OptionalIndex argument is used to index into the struct if present. + LLVMContext &Ctx = M.getContext(); + Type *I32 = Type::getInt32Ty(Ctx); + auto *I = cast<Instruction>(U.getUser()); + + Value *tableKernelIndex = getTableLookupKernelIndex(M, I->getFunction()); + + if (auto *Phi = dyn_cast<PHINode>(I)) { + BasicBlock *BB = Phi->getIncomingBlock(U); + Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt()))); + } else { + Builder.SetInsertPoint(I); + } + + SmallVector<Value *, 3> GEPIdx = { + ConstantInt::get(I32, 0), + tableKernelIndex, + }; + if (OptionalIndex) + GEPIdx.push_back(OptionalIndex); + + Value *Address = Builder.CreateInBoundsGEP( + LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName()); + + Value *loaded = Builder.CreateLoad(I32, Address); + + Value *replacement = + Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName()); + + U.set(replacement); + } + void replaceUsesInInstructionsWithTableLookup( Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables, GlobalVariable *LookupTable) { @@ -484,33 +534,8 @@ if (!I) continue; - Value *tableKernelIndex = - getTableLookupKernelIndex(M, I->getFunction()); - - // So if the phi uses this value multiple times, what does this look - // like?
- if (auto *Phi = dyn_cast<PHINode>(I)) { - BasicBlock *BB = Phi->getIncomingBlock(U); - Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt()))); - } else { - Builder.SetInsertPoint(I); - } - - Value *GEPIdx[3] = { - ConstantInt::get(I32, 0), - tableKernelIndex, - ConstantInt::get(I32, Index), - }; - - Value *Address = Builder.CreateInBoundsGEP( - LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName()); - - Value *loaded = Builder.CreateLoad(I32, Address); - - Value *replacement = - Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName()); - - U.set(replacement); + replaceUseWithTableLookup(M, Builder, LookupTable, GV, U, + ConstantInt::get(I32, Index)); } } } @@ -632,47 +657,52 @@ } static std::vector<Function *> assignLDSKernelIDToEachKernel( - Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS) { + Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS, + DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS) { // Associate kernels in the set with an arbitrary but reproducible order and // annotate them with that order in metadata. This metadata is recognised by // the backend and lowered to a SGPR which can be read from using // amdgcn_lds_kernel_id. std::vector<Function *> OrderedKernels; + if (!KernelsThatAllocateTableLDS.empty() || + !KernelsThatIndirectlyAllocateDynamicLDS.empty()) { - for (Function &Func : M->functions()) { - if (Func.isDeclaration()) - continue; - if (!isKernelLDS(&Func)) - continue; + for (Function &Func : M->functions()) { + if (Func.isDeclaration()) + continue; + if (!isKernelLDS(&Func)) + continue; - if (KernelsThatAllocateTableLDS.contains(&Func)) { - assert(Func.hasName()); // else fatal error earlier - OrderedKernels.push_back(&Func); + if (KernelsThatAllocateTableLDS.contains(&Func) || + KernelsThatIndirectlyAllocateDynamicLDS.contains(&Func)) { + assert(Func.hasName()); // else fatal error earlier + OrderedKernels.push_back(&Func); + } } - } - // Put them in an arbitrary but reproducible order - llvm::sort(OrderedKernels.begin(), OrderedKernels.end(), - [](const Function *lhs, const Function *rhs) -> bool { - return lhs->getName() < rhs->getName(); - }); + // Put them in an arbitrary but reproducible order + llvm::sort(OrderedKernels.begin(), OrderedKernels.end(), + [](const Function *lhs, const Function *rhs) -> bool { + return lhs->getName() < rhs->getName(); + }); - // Annotate the kernels with their order in this vector - LLVMContext &Ctx = M->getContext(); - IRBuilder<> Builder(Ctx); + // Annotate the kernels with their order in this vector + LLVMContext &Ctx = M->getContext(); + IRBuilder<> Builder(Ctx); - if (OrderedKernels.size() > UINT32_MAX) { - // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU - report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels"); - } + if (OrderedKernels.size() > UINT32_MAX) { + // 32 bit keeps it in one SGPR.
> 2**32 kernels won't fit on the GPU + report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels"); + } - for (size_t i = 0; i < OrderedKernels.size(); i++) { - Metadata *AttrMDArgs[1] = { - ConstantAsMetadata::get(Builder.getInt32(i)), - }; - OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id", - MDNode::get(Ctx, AttrMDArgs)); + for (size_t i = 0; i < OrderedKernels.size(); i++) { + Metadata *AttrMDArgs[1] = { + ConstantAsMetadata::get(Builder.getInt32(i)), + }; + OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id", + MDNode::get(Ctx, AttrMDArgs)); + } } return OrderedKernels; @@ -683,7 +713,8 @@ VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly, DenseSet<GlobalVariable *> &ModuleScopeVariables, DenseSet<GlobalVariable *> &TableLookupVariables, - DenseSet<GlobalVariable *> &KernelAccessVariables) { + DenseSet<GlobalVariable *> &KernelAccessVariables, + DenseSet<GlobalVariable *> &DynamicVariables) { GlobalVariable *HybridModuleRoot = LoweringKindLoc != LoweringKind::hybrid @@ -705,6 +736,11 @@ assert(AMDGPU::isLDSVariableToLower(*GV)); assert(K.second.size() != 0); + if (isDynamicLDS(GV)) { + DynamicVariables.insert(GV); + continue; + } + switch (LoweringKindLoc) { case LoweringKind::module: ModuleScopeVariables.insert(GV); @@ -743,7 +779,7 @@ // All LDS variables accessed indirectly have now been partitioned into // the distinct lowering strategies. assert(ModuleScopeVariables.size() + TableLookupVariables.size() + - KernelAccessVariables.size() == + KernelAccessVariables.size() + DynamicVariables.size() == LDSToKernelsThatNeedToAccessItIndirectly.size()); } @@ -813,7 +849,7 @@ markUsedByKernel(Builder, &Func, ModuleScopeReplacement.SGV); } else { - Func.addFnAttr("amdgpu-elide-module-lds"); + markElideModuleLDS(Func); } } @@ -838,13 +874,17 @@ // Allocating variables that are used directly in this struct to get // alignment aware allocation and predictable frame size. for (auto &v : LDSUsesInfo.direct_access[&Func]) { - KernelUsedVariables.insert(v); + if (!isDynamicLDS(v)) { + KernelUsedVariables.insert(v); + } } // Allocating variables that are accessed indirectly so that a lookup of // this struct instance can find them from nested functions. for (auto &v : LDSUsesInfo.indirect_access[&Func]) { - KernelUsedVariables.insert(v); + if (!isDynamicLDS(v)) { + KernelUsedVariables.insert(v); + } } // Variables allocated in module lds must all resolve to that struct, @@ -857,6 +897,7 @@ if (KernelUsedVariables.empty()) { // Either used no LDS, or the LDS it used was all in the module struct + // or dynamically sized continue; } @@ -876,33 +917,6 @@ auto Replacement = createLDSVariableReplacement(M, VarName, KernelUsedVariables); - // This struct is allocated at a predictable address that can be - // calculated now, recorded in metadata then used to lower references to - // it during codegen.
- { - // frame layout, starting from 0 - //{ - // module.lds - // alignment padding - // kernel instance - //} - - if (!MaybeModuleScopeStruct || - Func.hasFnAttribute("amdgpu-elide-module-lds")) { - // There's no module.lds for this kernel so this replacement struct - // goes first - recordLDSAbsoluteAddress(&M, Replacement.SGV, 0); - } else { - const DataLayout &DL = M.getDataLayout(); - TypeSize ModuleSize = - DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType()); - GlobalVariable *KernelStruct = Replacement.SGV; - Align KernelAlign = AMDGPU::getAlign(DL, KernelStruct); - recordLDSAbsoluteAddress(&M, Replacement.SGV, - alignTo(ModuleSize, KernelAlign)); - } - } - // remove preserves existing codegen removeLocalVarsFromUsedLists(M, KernelUsedVariables); KernelToReplacement[&Func] = Replacement; @@ -917,6 +931,124 @@ return KernelToReplacement; } + static GlobalVariable * + buildRepresentativeDynamicLDSInstance(Module &M, LDSUsesInfoTy &LDSUsesInfo, + Function *func) { + // Create a dynamic lds variable with a name associated with the passed + // function that has the maximum alignment of any dynamic lds variable + // reachable from this kernel. Dynamic LDS is allocated after the static LDS + // allocation, possibly after alignment padding. The representative variable + // created here has the maximum alignment of any other dynamic variable + // reachable by that kernel. All dynamic LDS variables are allocated at the + // same address in each kernel in order to provide the documented aliasing + // semantics. Setting the alignment here allows this IR pass to accurately + // predict the exact constant at which it will be allocated. + + assert(isKernelLDS(func)); + + LLVMContext &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + Align MaxDynamicAlignment(1); + + auto UpdateMaxAlignment = [&MaxDynamicAlignment, &DL](GlobalVariable *GV) { + if (isDynamicLDS(GV)) { + MaxDynamicAlignment = + std::max(MaxDynamicAlignment, AMDGPU::getAlign(DL, GV)); + } + }; + + for (GlobalVariable *GV : LDSUsesInfo.indirect_access[func]) { + UpdateMaxAlignment(GV); + } + + for (GlobalVariable *GV : LDSUsesInfo.direct_access[func]) { + UpdateMaxAlignment(GV); + } + + assert(func->hasName()); // Checked by caller + std::string VarName = + Twine("llvm.amdgcn." 
+ func->getName() + ".dynlds").str(); + + auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0); + GlobalVariable *N = new GlobalVariable( + M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr, + VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, + false); + N->setAlignment(MaxDynamicAlignment); + + assert(isDynamicLDS(N)); + return N; + } + + DenseMap lowerDynamicLDSVariables( + Module &M, LDSUsesInfoTy &LDSUsesInfo, + DenseSet const &KernelsThatIndirectlyAllocateDynamicLDS, + DenseSet const &DynamicVariables, + std::vector const &OrderedKernels) { + DenseMap KernelToCreatedDynamicLDS; + if (!KernelsThatIndirectlyAllocateDynamicLDS.empty()) { + LLVMContext &Ctx = M.getContext(); + IRBuilder<> Builder(Ctx); + Type *I32 = Type::getInt32Ty(Ctx); + + std::vector newDynamicLDS; + + // Table is built in the same order as OrderedKernels + for (auto &func : OrderedKernels) { + + if (KernelsThatIndirectlyAllocateDynamicLDS.contains(func)) { + assert(isKernelLDS(func)); + if (!func->hasName()) { + report_fatal_error("Anonymous kernels cannot use LDS variables"); + } + + GlobalVariable *N = + buildRepresentativeDynamicLDSInstance(M, LDSUsesInfo, func); + + KernelToCreatedDynamicLDS[func] = N; + + markUsedByKernel(Builder, func, N); + + auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0); + auto GEP = ConstantExpr::getGetElementPtr( + emptyCharArray, N, ConstantInt::get(I32, 0), true); + newDynamicLDS.push_back(ConstantExpr::getPtrToInt(GEP, I32)); + } else { + newDynamicLDS.push_back(PoisonValue::get(I32)); + } + } + assert(OrderedKernels.size() == newDynamicLDS.size()); + + ArrayType *t = ArrayType::get(I32, newDynamicLDS.size()); + Constant *init = ConstantArray::get(t, newDynamicLDS); + GlobalVariable *table = new GlobalVariable( + M, t, true, GlobalValue::InternalLinkage, init, + "llvm.amdgcn.dynlds.offset.table", nullptr, + GlobalValue::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS); + + for (GlobalVariable *GV : DynamicVariables) { + for (Use &U : make_early_inc_range(GV->uses())) { + auto *I = dyn_cast(U.getUser()); + if (!I) + continue; + if (isKernelLDS(I->getFunction())) + continue; + + replaceUseWithTableLookup(M, Builder, table, GV, U, nullptr); + } + } + } + return KernelToCreatedDynamicLDS; + } + + static bool canElideModuleLDS(const Function &F) { + return F.hasFnAttribute("amdgpu-elide-module-lds"); + } + + static void markElideModuleLDS(Function &F) { + F.addFnAttr("amdgpu-elide-module-lds"); + } + bool runOnModule(Module &M) override { CallGraph CG = CallGraph(M); bool Changed = superAlignLDSGlobals(M); @@ -939,12 +1071,15 @@ } } + // Partition variables accessed indirectly into the different strategies DenseSet ModuleScopeVariables; DenseSet TableLookupVariables; DenseSet KernelAccessVariables; + DenseSet DynamicVariables; partitionVariablesIntoIndirectStrategies( M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly, - ModuleScopeVariables, TableLookupVariables, KernelAccessVariables); + ModuleScopeVariables, TableLookupVariables, KernelAccessVariables, + DynamicVariables); // If the kernel accesses a variable that is going to be stored in the // module instance through a call then that kernel needs to allocate the @@ -956,6 +1091,10 @@ kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, TableLookupVariables); + DenseSet KernelsThatIndirectlyAllocateDynamicLDS = + kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, + DynamicVariables); + GlobalVariable *MaybeModuleScopeStruct = 
lowerModuleScopeStructVariables( M, ModuleScopeVariables, KernelsThatAllocateModuleLDS); @@ -981,14 +1120,15 @@ }); } + // The ith element of this vector is kernel id i + std::vector<Function *> OrderedKernels = + assignLDSKernelIDToEachKernel(&M, KernelsThatAllocateTableLDS, + KernelsThatIndirectlyAllocateDynamicLDS); + if (!KernelsThatAllocateTableLDS.empty()) { LLVMContext &Ctx = M.getContext(); IRBuilder<> Builder(Ctx); - // The ith element of this vector is kernel id i - std::vector<Function *> OrderedKernels = - assignLDSKernelIDToEachKernel(&M, KernelsThatAllocateTableLDS); - for (size_t i = 0; i < OrderedKernels.size(); i++) { markUsedByKernel(Builder, OrderedKernels[i], KernelToReplacement[OrderedKernels[i]].SGV); @@ -1010,6 +1150,70 @@ LookupTable); } + DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS = + lowerDynamicLDSVariables(M, LDSUsesInfo, + KernelsThatIndirectlyAllocateDynamicLDS, + DynamicVariables, OrderedKernels); + + // All kernel frames have been allocated. Calculate and record the + // addresses. + + { + const DataLayout &DL = M.getDataLayout(); + + for (Function &Func : M.functions()) { + if (Func.isDeclaration() || !isKernelLDS(&Func)) + continue; + + // All three of these are optional. The first variable is allocated at + // zero. They are allocated by allocateKnownAddressLDSGlobal in the + // following order: + //{ + // module.lds + // alignment padding + // kernel instance + // alignment padding + // dynamic lds variables + //} + + const bool AllocateModuleScopeStruct = + MaybeModuleScopeStruct && !canElideModuleLDS(Func); + + const bool AllocateKernelScopeStruct = + KernelToReplacement.contains(&Func); + + const bool AllocateDynamicVariable = + KernelToCreatedDynamicLDS.contains(&Func); + + uint32_t Offset = 0; + + if (AllocateModuleScopeStruct) { + // Allocated at zero, recorded once on construction, not once per + // kernel + Offset += DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType()); + } + + if (AllocateKernelScopeStruct) { + GlobalVariable *KernelStruct = KernelToReplacement[&Func].SGV; + + Offset = alignTo(Offset, AMDGPU::getAlign(DL, KernelStruct)); + + recordLDSAbsoluteAddress(&M, KernelStruct, Offset); + + Offset += DL.getTypeAllocSize(KernelStruct->getValueType()); + + } + + if (AllocateDynamicVariable) { + GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func]; + + Offset = alignTo(Offset, AMDGPU::getAlign(DL, DynamicVariable)); + + recordLDSAbsoluteAddress(&M, DynamicVariable, Offset); + } + } + } + for (auto &GV : make_early_inc_range(M.globals())) if (AMDGPU::isLDSVariableToLower(GV)) { // probably want to remove from used lists diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -111,7 +111,7 @@ Align getDynLDSAlign() const { return DynLDSAlign; } - void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV); + void setDynLDSAlign(const Function &F, const GlobalVariable &GV); }; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -99,6 +99,15 @@ return M->getNamedGlobal(KernelLDSName); } +static const GlobalVariable * +getKernelDynLDSGlobalFromFunction(const Function &F) { + const Module *M = F.getParent(); + std::string KernelDynLDSName = "llvm.amdgcn."; + KernelDynLDSName += F.getName(); + KernelDynLDSName +=
".dynlds"; + return M->getNamedGlobal(KernelDynLDSName); +} + // This kernel calls no functions that require the module lds struct static bool canElideModuleLDS(const Function &F) { return F.hasFnAttribute("amdgpu-elide-module-lds"); @@ -131,11 +140,12 @@ const GlobalVariable *GV = M->getNamedGlobal(ModuleLDSName); const GlobalVariable *KV = getKernelLDSGlobalFromFunction(F); + const GlobalVariable *Dyn = getKernelDynLDSGlobalFromFunction(F); if (GV && !canElideModuleLDS(F)) { unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV, Align()); std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*GV); - if (!Expect || (Offset != Expect)) { + if (!Expect || (Offset != *Expect)) { report_fatal_error("Inconsistent metadata on module LDS variable"); } } @@ -145,10 +155,23 @@ // before any other non-module LDS variables. unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *KV, Align()); std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*KV); - if (!Expect || (Offset != Expect)) { + if (!Expect || (Offset != *Expect)) { report_fatal_error("Inconsistent metadata on kernel LDS variable"); } } + + if (Dyn) { + // The dynamic LDS is deterministic because the per-kernel one has the + // maximum alignment of any reachable LDS, and all remaining LDS variables, + // if present, are themselves dynamic LDS and will be allocated + // at the same address. + setDynLDSAlign(F, *Dyn); + unsigned Offset = LDSSize; + std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*Dyn); + if (!Expect || (Offset != *Expect)) { + report_fatal_error("Inconsistent metadata on dynamic LDS variable"); + } + } } } @@ -187,8 +210,10 @@ return {}; } -void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL, +void AMDGPUMachineFunction::setDynLDSAlign(const Function &F, const GlobalVariable &GV) { + const Module *M = F.getParent(); + const DataLayout &DL = M->getDataLayout(); assert(DL.getTypeAllocSize(GV.getValueType()).isZero()); Align Alignment = @@ -198,4 +223,17 @@ LDSSize = alignTo(StaticLDSSize, Alignment); DynLDSAlign = Alignment; + + // If there is a dynamic LDS variable associated with this function F, every + // further dynamic LDS instance (allocated by calling setDynLDSAlign) must + // map to the same address. This holds because no LDS is allocated after the + // lowering pass if there are dynamic LDS variables present. + const GlobalVariable *Dyn = getKernelDynLDSGlobalFromFunction(F); + if (Dyn) { + unsigned Offset = LDSSize; // return this? + std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*Dyn); + if (!Expect || (Offset != *Expect)) { + report_fatal_error("Inconsistent metadata on dynamic LDS variable"); + } + } } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6139,7 +6139,8 @@ if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) { assert(PtrVT == MVT::i32 && "32-bit pointer is expected."); // Adjust alignment for that dynamic shared memory array.
- MFI->setDynLDSAlign(DAG.getDataLayout(), *cast<GlobalVariable>(GV)); + Function &F = DAG.getMachineFunction().getFunction(); + MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV)); return SDValue( DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp @@ -79,24 +79,23 @@ if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) { return false; } + if (GV.isConstant()) { + // A constant undef variable can't be written to, and any load is + // undef, so it should be eliminated by the optimizer. It could be + // dropped by the back end if not. This pass skips over it. + return false; + } if (!GV.hasInitializer()) { // addrspace(3) without initializer implies cuda/hip extern __shared__ // the semantics for such a variable appears to be that all extern - // __shared__ variables alias one another, in which case this transform - // is not required - return false; + // __shared__ variables alias one another. This hits different handling. + return true; } if (!isa<UndefValue>(GV.getInitializer())) { // Initializers are unimplemented for LDS address space. // Leave such variables in place for consistent error reporting. return false; } - if (GV.isConstant()) { - // A constant undef variable can't be written to, and any load is - // undef, so it should be eliminated by the optimizer. It could be - // dropped by the back end if not. This pass skips over it. - return false; - } return true; } diff --git a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll --- a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll +++ b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll @@ -135,25 +135,4 @@ ret void } -; CHECK-LABEL: dynamic_shared_array_with_call: -; CHECK-NOT: s_swappc_b64 -define amdgpu_kernel void @dynamic_shared_array_with_call(ptr addrspace(1) nocapture readnone %out) local_unnamed_addr { - %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() - %1 = sext i32 %tid.x to i64 - %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds0, i64 0, i64 %1 - %val0 = load float, ptr addrspace(3) %arrayidx0, align 4 - tail call void @store_value(float %val0) - ret void -} - -; CHECK-NOT: store_value -define linkonce_odr hidden void @store_value(float %val1) local_unnamed_addr { -entry: - %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() - %0 = sext i32 %tid.x to i64 - %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @dynamic_shared0, i64 0, i64 %0 - store float %val1, ptr addrspace(3) %arrayidx1, align 4 - ret void -} - declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll --- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -34,7 +34,64 @@ @extern_normal = external addrspace(3) global [0 x float] @extern_overalign = external addrspace(3) global [0 x float], align 8 -; 2^3 cases encoded into function names + +; External LDS does not influence the frame when called indirectly either +define void @use_extern_normal() #0 { +; CHECK-LABEL: use_extern_normal: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, llvm.amdgcn.dynlds.offset.table@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.dynlds.offset.table@rel32@hi+12 +; CHECK-NEXT: s_mov_b32 s4, s15 +; CHECK-NEXT: s_ashr_i32 s5, s15, 31 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x4048f5c3 +; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; CHECK-NEXT: s_add_u32 s4, s4, s6 +; CHECK-NEXT: s_addc_u32 s5, s5, s7 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: ds_write_b32 v1, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %arrayidx = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_normal, i32 0, i32 0 + store float 0x40091EB860000000, ptr addrspace(3) %arrayidx + ret void +} + +define void @use_extern_overalign() #0 { +; CHECK-LABEL: use_extern_overalign: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, llvm.amdgcn.dynlds.offset.table@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.dynlds.offset.table@rel32@hi+12 +; CHECK-NEXT: s_mov_b32 s4, s15 +; CHECK-NEXT: s_ashr_i32 s5, s15, 31 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x42280000 +; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; CHECK-NEXT: s_add_u32 s4, s4, s6 +; CHECK-NEXT: s_addc_u32 s5, s5, s7 +; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: ds_write_b32 v1, v0 offset:4 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %arrayidx = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_overalign, i32 0, i32 1 + store float 4.200000e+01, ptr addrspace(3) %arrayidx + ret void +} + + +; First 2^3 of 2^4 cases encoded into function names +; no use of extern variable from nested function +; module_variable used/not-used +; kernel variable normal/overaligned +; extern variable normal/overaligned define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) #1 { ; CHECK-LABEL: module_0_kernel_normal_extern_normal: @@ -268,5 +325,281 @@ ret void } + +;; Second 2^3 of 2^4 cases encoded into function names +; with extern variable from nested function +; module_variable used/not-used +; kernel variable normal/overaligned +; extern variable normal/overaligned + +define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %idx) #1 { +; CHECK-LABEL: module_0_kernel_normal_indirect_extern_normal: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 s8, s8, s11 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; CHECK-NEXT: s_add_u32 s0, s0, s11 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s15, 0 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[8:9] +; CHECK-NEXT: s_endpgm + store i16 2, ptr addrspace(3) @kernel_normal + + call void @use_extern_normal() + ret void +} + +define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %idx) { +; CHECK-LABEL: module_1_kernel_normal_indirect_extern_normal: +; CHECK: ; %bb.0: +; 
CHECK-NEXT: s_add_u32 s8, s8, s11 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; CHECK-NEXT: s_add_u32 s0, s0, s11 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: s_mov_b32 s15, 4 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: ds_write_b16 v0, v2 offset:2 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_endpgm + call void @use_module() + store i16 1, ptr addrspace(3) @module_variable + + store i16 2, ptr addrspace(3) @kernel_normal + + call void @use_extern_normal() + ret void +} + +define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 %idx) #1 { +; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_normal: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 s8, s8, s11 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; CHECK-NEXT: s_add_u32 s0, s0, s11 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s15, 2 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[8:9] +; CHECK-NEXT: s_endpgm + store i16 2, ptr addrspace(3) @kernel_overalign + + call void @use_extern_normal() + ret void +} + +define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 %idx) { +; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_normal: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 s8, s8, s11 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; CHECK-NEXT: s_add_u32 s0, s0, s11 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_normal@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_normal@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: s_mov_b32 
s15, 6 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: ds_write_b16 v0, v2 offset:4 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_endpgm + call void @use_module() + store i16 1, ptr addrspace(3) @module_variable + + store i16 2, ptr addrspace(3) @kernel_overalign + + call void @use_extern_normal() + ret void +} + +define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 %idx) #1 { +; CHECK-LABEL: module_0_kernel_normal_indirect_extern_overalign: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 s8, s8, s11 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; CHECK-NEXT: s_add_u32 s0, s0, s11 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s15, 1 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[8:9] +; CHECK-NEXT: s_endpgm + store i16 2, ptr addrspace(3) @kernel_normal + + call void @use_extern_overalign() + ret void +} + +define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 %idx) { +; CHECK-LABEL: module_1_kernel_normal_indirect_extern_overalign: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 s8, s8, s11 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; CHECK-NEXT: s_add_u32 s0, s0, s11 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: s_mov_b32 s15, 5 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: ds_write_b16 v0, v2 offset:2 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_endpgm + call void @use_module() + store i16 1, ptr addrspace(3) @module_variable + + store i16 2, ptr addrspace(3) @kernel_normal + + call void @use_extern_overalign() + ret void +} + +define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i32 %idx) #1 { +; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_overalign: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 s8, s8, s11 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; CHECK-NEXT: s_add_u32 s0, s0, s11 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_getpc_b64 s[6:7] +; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4 +; 
CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 2 +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s15, 3 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[8:9] +; CHECK-NEXT: s_endpgm + store i16 2, ptr addrspace(3) @kernel_overalign + + call void @use_extern_overalign() + ret void +} + +define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i32 %idx) { +; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_overalign: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 s8, s8, s11 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; CHECK-NEXT: s_add_u32 s0, s0, s11 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, use_extern_overalign@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_overalign@gotpcrel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: s_mov_b32 s15, 7 +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: ds_write_b16 v0, v2 offset:4 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_endpgm + call void @use_module() + store i16 1, ptr addrspace(3) @module_variable + + store i16 2, ptr addrspace(3) @kernel_overalign + + call void @use_extern_overalign() + ret void +} + + attributes #0 = { noinline } attributes #1 = { "amdgpu-elide-module-lds" } diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll @@ -18,10 +18,6 @@ @const_undef = addrspace(3) constant i32 undef @const_with_init = addrspace(3) constant i64 8 -; External and constant are both left to the optimizer / error diagnostics -; CHECK: @extern = external addrspace(3) global i32 -@extern = external addrspace(3) global i32 - ; Use of an addrspace(3) variable with an initializer is skipped, ; so as to preserve the unimplemented error from llc ; CHECK: @with_init = addrspace(3) global i64 0 @@ -47,14 +43,12 @@ ; CHECK: %c0 = load i32, ptr addrspace(3) @const_undef, align 4 ; CHECK: %c1 = load i64, ptr addrspace(3) @const_with_init, align 4 ; CHECK: %v0 = atomicrmw add ptr addrspace(3) @with_init, i64 1 seq_cst -; CHECK: %v1 = cmpxchg ptr addrspace(3) @extern, i32 4, i32 %c0 acq_rel monotonic -; CHECK: %v2 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic +; CHECK: %v1 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic define void @use_variables() { %c0 = load i32, ptr addrspace(3) @const_undef, align 4 %c1 = load i64, ptr addrspace(3) @const_with_init, align 4 %v0 = atomicrmw add ptr addrspace(3) @with_init, i64 1 seq_cst - %v1 = cmpxchg ptr addrspace(3) @extern, i32 4, i32 %c0 acq_rel monotonic - %v2 = 
atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic + %v1 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic ret void } diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll @@ -0,0 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s + + +; Not reached by a non-kernel function and therefore not changed by this pass +@dynamic_kernel_only = external addrspace(3) global [0 x double] + +; shared8 is used directly by a kernel so remains in the output +; the other three are only used by functions and will be replaced with +; table lookups and dropped from the IR +@dynamic_shared1 = external addrspace(3) global [0 x i8], align 1 +@dynamic_shared2 = external addrspace(3) global [0 x i16], align 2 +@dynamic_shared4 = external addrspace(3) global [0 x i32], align 4 +@dynamic_shared8 = external addrspace(3) global [0 x i64], align 8 + +; CHECK: %llvm.amdgcn.module.lds.t = type { i32 } +; CHECK: @dynamic_kernel_only = external addrspace(3) global [0 x double] +; CHECK: @dynamic_shared8 = external addrspace(3) global [0 x i64], align 8 +; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4, !absolute_symbol !0 +; CHECK: @llvm.compiler.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" + +; Alignment of these must be the maximum of the alignment of the reachable symbols +; CHECK: @llvm.amdgcn.expect_align1.dynlds = external addrspace(3) global [0 x i8], align 1, !absolute_symbol !0 +; CHECK: @llvm.amdgcn.expect_align2.dynlds = external addrspace(3) global [0 x i8], align 2, !absolute_symbol !0 +; CHECK: @llvm.amdgcn.expect_align4.dynlds = external addrspace(3) global [0 x i8], align 4, !absolute_symbol !1 +; CHECK: @llvm.amdgcn.expect_align8.dynlds = external addrspace(3) global [0 x i8], align 8, !absolute_symbol !0 + +; Align 4 and symbol at address [4 5) as module.lds is reachable +; CHECK: @llvm.amdgcn.expect_max_of_2_and_4.dynlds = external addrspace(3) global [0 x i8], align 4, !absolute_symbol !1 + +; Builds a lookup table out of the newly created (suffixed .dynlds) variables in kernel.id order +; CHECK: @llvm.amdgcn.dynlds.offset.table = internal addrspace(4) constant [5 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.expect_align1.dynlds to i32), i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.expect_align2.dynlds to i32), i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.expect_align4.dynlds to i32), i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.expect_align8.dynlds to i32), i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.expect_max_of_2_and_4.dynlds to i32)] + + + +define amdgpu_kernel void @kernel_only() { +; CHECK-LABEL: @kernel_only() #0 { +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x double], ptr addrspace(3) @dynamic_kernel_only, i32 0, i32 0 +; CHECK-NEXT: store double 3.140000e+00, ptr addrspace(3) [[ARRAYIDX]], align 8 +; CHECK-NEXT: ret void +; + %arrayidx = getelementptr inbounds [0 x double], ptr addrspace(3) @dynamic_kernel_only, i32 0, i32 0 + store double 3.140000e+00, ptr addrspace(3) %arrayidx + ret void +} + +; The accesses from functions are
rewritten to go through the llvm.amdgcn.dynlds.offset.table +define void @use_shared1() #0 { +; CHECK-LABEL: @use_shared1() #1 { +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; CHECK-NEXT: [[DYNAMIC_SHARED1:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED1]], align 4 +; CHECK-NEXT: [[DYNAMIC_SHARED11:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr addrspace(3) [[DYNAMIC_SHARED11]], i32 0, i32 1 +; CHECK-NEXT: store i8 0, ptr addrspace(3) [[ARRAYIDX]], align 1 +; CHECK-NEXT: ret void +; + %arrayidx = getelementptr inbounds [0 x i8], ptr addrspace(3) @dynamic_shared1, i32 0, i32 1 + store i8 0, ptr addrspace(3) %arrayidx + ret void +} + +define void @use_shared2() #0 { +; CHECK-LABEL: @use_shared2() #1 { +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; CHECK-NEXT: [[DYNAMIC_SHARED2:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED2]], align 4 +; CHECK-NEXT: [[DYNAMIC_SHARED21:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i16], ptr addrspace(3) [[DYNAMIC_SHARED21]], i32 0, i32 3 +; CHECK-NEXT: store i16 1, ptr addrspace(3) [[ARRAYIDX]], align 2 +; CHECK-NEXT: ret void +; + %arrayidx = getelementptr inbounds [0 x i16], ptr addrspace(3) @dynamic_shared2, i32 0, i32 3 + store i16 1, ptr addrspace(3) %arrayidx + ret void +} + +; Include a normal variable so that the new variables aren't all at the same absolute_symbol +@static_shared = addrspace(3) global i32 undef +define void @use_shared4() #0 { +; CHECK-LABEL: @use_shared4() #1 { +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; CHECK-NEXT: store i32 4, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4 +; CHECK-NEXT: [[DYNAMIC_SHARED4:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED4]], align 4 +; CHECK-NEXT: [[DYNAMIC_SHARED41:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr addrspace(3) [[DYNAMIC_SHARED41]], i32 0, i32 5 +; CHECK-NEXT: store i32 2, ptr addrspace(3) [[ARRAYIDX]], align 4 +; CHECK-NEXT: ret void +; + store i32 4, ptr addrspace(3) @static_shared + %arrayidx = getelementptr inbounds [0 x i32], ptr addrspace(3) @dynamic_shared4, i32 0, i32 5 + store i32 2, ptr addrspace(3) %arrayidx + ret void +} + +define void @use_shared8() #0 { +; CHECK-LABEL: @use_shared8() #1 { +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; CHECK-NEXT: [[DYNAMIC_SHARED8:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED8]], align 4 +; CHECK-NEXT: [[DYNAMIC_SHARED81:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i64], ptr addrspace(3) [[DYNAMIC_SHARED81]], i32 0, i32 7 +; CHECK-NEXT: store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 4 +; CHECK-NEXT: ret void +; + %arrayidx = getelementptr inbounds [0 x i64], ptr addrspace(3) @dynamic_shared8, i32 0, i32 7 + 
store i64 3, ptr addrspace(3) %arrayidx + ret void +} + +; The kernels are annotated with kernel.id and llvm.donothing use of the corresponding variable +define amdgpu_kernel void @expect_align1() { +; CHECK-LABEL: @expect_align1() #0 !llvm.amdgcn.lds.kernel.id !2 +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align1.dynlds) ] +; CHECK-NEXT: call void @use_shared1() +; CHECK-NEXT: ret void +; + call void @use_shared1() + ret void +} + +define amdgpu_kernel void @expect_align2() { +; CHECK-LABEL: @expect_align2() #0 !llvm.amdgcn.lds.kernel.id !3 +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align2.dynlds) ] +; CHECK-NEXT: call void @use_shared2() +; CHECK-NEXT: ret void +; + call void @use_shared2() + ret void +} + +define amdgpu_kernel void @expect_align4() { +; CHECK-LABEL: @expect_align4() !llvm.amdgcn.lds.kernel.id !4 { +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align4.dynlds) ] +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] +; CHECK-NEXT: call void @use_shared4() +; CHECK-NEXT: ret void +; + call void @use_shared4() + ret void +} + +; Use dynamic_shared directly too. Can elide module lds (#0) +define amdgpu_kernel void @expect_align8() { +; CHECK-LABEL: @expect_align8() #0 !llvm.amdgcn.lds.kernel.id !5 { +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align8.dynlds) ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i64], ptr addrspace(3) @dynamic_shared8, i32 0, i32 9 +; CHECK-NEXT: store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 4 +; CHECK-NEXT: call void @use_shared8() +; CHECK-NEXT: ret void +; + %arrayidx = getelementptr inbounds [0 x i64], ptr addrspace(3) @dynamic_shared8, i32 0, i32 9 + store i64 3, ptr addrspace(3) %arrayidx + call void @use_shared8() + ret void +} + +; Note: use_shared4 uses module.lds so this will allocate at offset 4 +define amdgpu_kernel void @expect_max_of_2_and_4() { +; CHECK-LABEL: @expect_max_of_2_and_4() !llvm.amdgcn.lds.kernel.id !6 { +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_max_of_2_and_4.dynlds) ] +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] +; CHECK-NEXT: call void @use_shared2() +; CHECK-NEXT: call void @use_shared4() +; CHECK-NEXT: ret void +; + call void @use_shared2() + call void @use_shared4() + ret void +} + + +attributes #0 = { noinline } + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +; CHECK: declare void @llvm.donothing() #2 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +; CHECK: declare i32 @llvm.amdgcn.lds.kernel.id() #3 + +; CHECK: attributes #0 = { "amdgpu-elide-module-lds" } +; CHECK: attributes #1 = { noinline } +; CHECK: attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +; CHECK: !0 = !{i64 0, i64 1} +; CHECK: !1 = !{i64 4, i64 5} +; CHECK: !2 = !{i32 0} +; CHECK: !3 = !{i32 1} +; CHECK: !4 = !{i32 2} +; CHECK: !5 = !{i32 3} +; CHECK: !6 = !{i32 4}