diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -1093,6 +1093,9 @@
     auto *I = dyn_cast<Instruction>(U.getUser());
     if (!I)
       continue;
+
+    // FIXME: This means not all dynamic LDS variables have an absolute
+    // symbol by the end of this pass.
     if (isKernelLDS(I->getFunction()))
       continue;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -882,86 +882,28 @@
   if (LocalMemLimit == 0)
     return false;
 
-  SmallVector<const Constant *, 16> Stack;
-  SmallPtrSet<const Constant *, 8> VisitedConstants;
-  SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
-
-  auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
-    for (const User *U : Val->users()) {
-      if (const Instruction *Use = dyn_cast<Instruction>(U)) {
-        if (Use->getParent()->getParent() == &F)
-          return true;
-      } else {
-        const Constant *C = cast<Constant>(U);
-        if (VisitedConstants.insert(C).second)
-          Stack.push_back(C);
-      }
-    }
-
-    return false;
-  };
-
-  for (GlobalVariable &GV : Mod->globals()) {
-    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
-      continue;
-
-    if (visitUsers(&GV, &GV)) {
-      UsedLDS.insert(&GV);
-      Stack.clear();
-      continue;
-    }
-
-    // For any ConstantExpr uses, we need to recursively search the users until
-    // we see a function.
-    while (!Stack.empty()) {
-      const Constant *C = Stack.pop_back_val();
-      if (visitUsers(&GV, C)) {
-        UsedLDS.insert(&GV);
-        Stack.clear();
-        break;
-      }
-    }
-  }
+  // LowerModuleToLDS runs unconditionally before this pass. The two calling
+  // conventions that pass the check in tryPromoteAllocaToLDS are
+  // {AMDGPU,SPIR}_KERNEL, both of which are rewritten by LowerModuleToLDS.
 
-  const DataLayout &DL = Mod->getDataLayout();
-  SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
-  AllocatedSizes.reserve(UsedLDS.size());
+  std::pair<unsigned, unsigned> LDSSizeRange = AMDGPU::getIntegerPairAttribute(
+      F, "amdgpu-lds-size", {0, UINT32_MAX}, true);
 
-  for (const GlobalVariable *GV : UsedLDS) {
-    Align Alignment =
-        DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
-    uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
+  if (LDSSizeRange.first >= LDSSizeRange.second) {
+    // Then no space is available to allocate. Currently this is only the
+    // case if there is externally allocated LDS in use by the kernel.
 
     // HIP uses an extern unsized array in local address space for dynamically
     // allocated shared memory. In that case, we have to disable the promotion.
-    if (GV->hasExternalLinkage() && AllocSize == 0) {
-      LocalMemLimit = 0;
-      LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
-                           "local memory. Promoting to local memory "
-                           "disabled.\n");
-      return false;
-    }
-
-    AllocatedSizes.emplace_back(AllocSize, Alignment);
+    LocalMemLimit = 0;
+    LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
+                         "local memory. Promoting to local memory "
+                         "disabled.\n");
+    return false;
   }
 
-  // Sort to try to estimate the worst case alignment padding
-  //
-  // FIXME: We should really do something to fix the addresses to a more optimal
-  // value instead
-  llvm::sort(AllocatedSizes, llvm::less_second());
-
-  // Check how much local memory is being used by global objects
-  CurrentLocalMemUsage = 0;
-
-  // FIXME: Try to account for padding here. The real padding and address is
-  // currently determined from the inverse order of uses in the function when
-  // legalizing, which could also potentially change. We try to estimate the
-  // worst case here, but we probably should fix the addresses earlier.
-  for (auto Alloc : AllocatedSizes) {
-    CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
-    CurrentLocalMemUsage += Alloc.first;
-  }
+  CurrentLocalMemUsage = LDSSizeRange.first;
+  LocalMemLimit = std::min(LocalMemLimit, LDSSizeRange.second);
 
   unsigned MaxOccupancy =
       ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, F);
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
deleted file mode 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
+++ /dev/null
@@ -1,132 +0,0 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -disable-promote-alloca-to-vector -amdgpu-enable-lower-module-lds=0 < %s | FileCheck -check-prefix=GCN %s
-
-; This shows that the amount LDS size estimate should try to not be
-; sensitive to the order of the LDS globals. This should try to
-; estimate the worst case padding behavior to avoid overallocating
-; LDS.
-
-; These functions use the same amount of LDS, but the total, final
-; size changes depending on the visit order of first use.
-
-; The one with the suboptimal order resulting in extra padding exceeds
-; the desired limit
-
-; The padding estimate heuristic used by the promote alloca pass
-; is mostly determined by the order of the globals,
-
-; Raw usage = 1060 bytes
-; Rounded usage:
-; 292 + (4 pad) + 256 + (8 pad) + 512 = 1072
-; 512 + (0 pad) + 256 + (0 pad) + 292 = 1060
-
-; At default occupancy guess of 7, 2340 bytes available total.
-
-; 1280 need to be left to promote alloca
-; optimally packed, this requires
-
-
-@lds0 = internal unnamed_addr addrspace(3) global [32 x <4 x i32>] undef, align 16
-@lds2 = internal unnamed_addr addrspace(3) global [32 x i64] undef, align 8
-@lds1 = internal unnamed_addr addrspace(3) global [73 x i32] undef, align 4
-
-
-; GCN-LABEL: {{^}}promote_alloca_size_order_0:
-; GCN: workgroup_group_segment_byte_size = 1060
-define amdgpu_kernel void @promote_alloca_size_order_0(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in, i32 %idx) #0 {
-entry:
-  %stack = alloca [5 x i32], align 4, addrspace(5)
-  %tmp0 = load i32, ptr addrspace(1) %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp0
-  store i32 4, ptr addrspace(5) %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
-  %tmp1 = load i32, ptr addrspace(1) %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp1
-  store i32 5, ptr addrspace(5) %arrayidx3, align 4
-  %tmp2 = load i32, ptr addrspace(5) %stack, align 4
-  store i32 %tmp2, ptr addrspace(1) %out, align 4
-  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
-  %tmp3 = load i32, ptr addrspace(5) %arrayidx12
-  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
-  store i32 %tmp3, ptr addrspace(1) %arrayidx13
-
-  %gep.lds1 = getelementptr inbounds [73 x i32], ptr addrspace(3) @lds1, i32 0, i32 %idx
-  store volatile i32 0, ptr addrspace(3) %gep.lds1, align 4
-
-  %gep.lds2 = getelementptr inbounds [32 x i64], ptr addrspace(3) @lds2, i32 0, i32 %idx
-  store volatile i64 0, ptr addrspace(3) %gep.lds2, align 8
-
-  %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], ptr addrspace(3) @lds0, i32 0, i32 %idx
-  store volatile <4 x i32> zeroinitializer, ptr addrspace(3) %gep.lds0, align 16
-
-  ret void
-}
-
-; GCN-LABEL: {{^}}promote_alloca_size_order_1:
-; GCN: workgroup_group_segment_byte_size = 1072
-define amdgpu_kernel void @promote_alloca_size_order_1(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in, i32 %idx) #0 {
-entry:
-  %stack = alloca [5 x i32], align 4, addrspace(5)
-  %tmp0 = load i32, ptr addrspace(1) %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp0
-  store i32 4, ptr addrspace(5) %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
-  %tmp1 = load i32, ptr addrspace(1) %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp1
-  store i32 5, ptr addrspace(5) %arrayidx3, align 4
-  %tmp2 = load i32, ptr addrspace(5) %stack, align 4
-  store i32 %tmp2, ptr addrspace(1) %out, align 4
-  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
-  %tmp3 = load i32, ptr addrspace(5) %arrayidx12
-  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
-  store i32 %tmp3, ptr addrspace(1) %arrayidx13
-
-  %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], ptr addrspace(3) @lds0, i32 0, i32 %idx
-  store volatile <4 x i32> zeroinitializer, ptr addrspace(3) %gep.lds0, align 16
-
-  %gep.lds2 = getelementptr inbounds [32 x i64], ptr addrspace(3) @lds2, i32 0, i32 %idx
-  store volatile i64 0, ptr addrspace(3) %gep.lds2, align 8
-
-  %gep.lds1 = getelementptr inbounds [73 x i32], ptr addrspace(3) @lds1, i32 0, i32 %idx
-  store volatile i32 0, ptr addrspace(3) %gep.lds1, align 4
-
-  ret void
-}
-
-@lds3 = internal unnamed_addr addrspace(3) global [13 x i32] undef, align 4
-@lds4 = internal unnamed_addr addrspace(3) global [63 x <4 x i32>] undef, align 16
-
-; The guess from the alignment padding pushes this over the determined
-; size limit, so it isn't promoted
-
-; GCN-LABEL: {{^}}promote_alloca_align_pad_guess_over_limit:
-; GCN: workgroup_group_segment_byte_size = 1060
-define amdgpu_kernel void @promote_alloca_align_pad_guess_over_limit(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in, i32 %idx) #0 {
-entry:
-  %stack = alloca [5 x i32], align 4, addrspace(5)
-  %tmp0 = load i32, ptr addrspace(1) %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp0
-  store i32 4, ptr addrspace(5) %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
-  %tmp1 = load i32, ptr addrspace(1) %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp1
-  store i32 5, ptr addrspace(5) %arrayidx3, align 4
-  %tmp2 = load i32, ptr addrspace(5) %stack, align 4
-  store i32 %tmp2, ptr addrspace(1) %out, align 4
-  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
-  %tmp3 = load i32, ptr addrspace(5) %arrayidx12
-  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
-  store i32 %tmp3, ptr addrspace(1) %arrayidx13
-
-  %gep.lds3 = getelementptr inbounds [13 x i32], ptr addrspace(3) @lds3, i32 0, i32 %idx
-  store volatile i32 0, ptr addrspace(3) %gep.lds3, align 4
-
-  %gep.lds4 = getelementptr inbounds [63 x <4 x i32>], ptr addrspace(3) @lds4, i32 0, i32 %idx
-  store volatile <4 x i32> zeroinitializer, ptr addrspace(3) %gep.lds4, align 16
-
-  ret void
-}
-
-attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,7" }
-
-!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 200}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -S -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s
-; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=ASM %s
+; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=ASM %s
 
 target datalayout = "A5"
 
@@ -155,53 +155,4 @@
   ret void
 }
 
-; IR-LABEL: @constant_expression_uses_some_lds_global_initializer(
-; IR-NOT: alloca
-; IR: llvm.amdgcn.workitem.id
-
-; ASM-LABEL: {{^}}constant_expression_uses_some_lds_global_initializer:
-; ASM: .amdhsa_group_segment_fixed_size 4096{{$}}
-define amdgpu_kernel void @constant_expression_uses_some_lds_global_initializer(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
-entry:
-  %stack = alloca [4 x i32], align 4, addrspace(5)
-  %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
-  %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
-  %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
-  store i32 9, ptr addrspace(5) %stack
-  store i32 10, ptr addrspace(5) %gep1
-  store i32 99, ptr addrspace(5) %gep2
-  store i32 43, ptr addrspace(5) %gep3
-  %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
-  %load = load i32, ptr addrspace(5) %arrayidx, align 4
-  store i32 %load, ptr addrspace(1) %out
-
-  store volatile i32 ptrtoint (ptr addrspace(1) @initializer_user_some to i32), ptr addrspace(1) undef
-  ret void
-}
-
-; We can't actually handle LDS initializers in global initializers,
-; but this should count as usage.
-
-; IR-LABEL: @constant_expression_uses_all_lds_global_initializer(
-; IR: alloca
-
-; ASM-LABEL: {{^}}constant_expression_uses_all_lds_global_initializer:
-; ASM: .group_segment_fixed_size: 65536
-define amdgpu_kernel void @constant_expression_uses_all_lds_global_initializer(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
-entry:
-  %stack = alloca [4 x i32], align 4, addrspace(5)
-  %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
-  %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
-  %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
-  store i32 9, ptr addrspace(5) %stack
-  store i32 10, ptr addrspace(5) %gep1
-  store i32 99, ptr addrspace(5) %gep2
-  store i32 43, ptr addrspace(5) %gep3
-  %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
-  %load = load i32, ptr addrspace(5) %arrayidx, align 4
-  store i32 %load, ptr addrspace(1) %out
-  store volatile i32 ptrtoint (ptr addrspace(1) @initializer_user_all to i32), ptr addrspace(1) undef
-  ret void
-}
-
 attributes #0 = { "amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="256,256" }
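
A note on the attribute the rewritten check consumes (an illustrative sketch with assumed values, not part of the patch): hasSufficientLocalMem now relies on the "amdgpu-lds-size" string attribute that the module LDS lowering pass attaches to kernels, and AMDGPU::getIntegerPairAttribute reads it as one or two comma-separated unsigned integers, the second being optional here because OnlyFirstRequired is true. A minimal hypothetical kernel showing the shape of that input:

; Hypothetical IR after AMDGPULowerModuleLDS has annotated the kernel; the
; sizes are made up. Per the code above, the first value seeds
; CurrentLocalMemUsage, the second (when present) caps LocalMemLimit, and
; promotion is disabled when the first is >= the second.
define amdgpu_kernel void @lds_budget_example() #0 {
  ret void
}

attributes #0 = { "amdgpu-lds-size"="1060,2340" }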