Index: llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -749,41 +749,79 @@
   if (LocalMemLimit == 0)
     return false;
 
-  const DataLayout &DL = Mod->getDataLayout();
+  SmallVector<const Constant *, 16> Stack;
+  SmallPtrSet<const Constant *, 8> VisitedConstants;
+  SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
+
+  auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
+    for (const User *U : Val->users()) {
+      if (const Instruction *Use = dyn_cast<Instruction>(U)) {
+        if (Use->getParent()->getParent() == &F)
+          return true;
+      } else {
+        const Constant *C = cast<Constant>(U);
+        if (VisitedConstants.insert(C).second)
+          Stack.push_back(C);
+      }
+    }
+
+    return false;
+  };
 
-  // Check how much local memory is being used by global objects
-  CurrentLocalMemUsage = 0;
   for (GlobalVariable &GV : Mod->globals()) {
     if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
       continue;
 
-    for (const User *U : GV.users()) {
-      const Instruction *Use = dyn_cast<Instruction>(U);
-      if (!Use) {
-        // FIXME: This is probably a constant expression use. We should
-        // recursively search the users of it for the parent function instead of
-        // bailing.
-        LLVM_DEBUG(dbgs() << "Giving up on LDS size estimate "
-                             "due to constant expression\n");
-        return false;
-      }
+    if (visitUsers(&GV, &GV)) {
+      UsedLDS.insert(&GV);
+      Stack.clear();
+      continue;
+    }
 
-      if (Use->getParent()->getParent() == &F) {
-        Align Alignment =
-            DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
-
-        // FIXME: Try to account for padding here. The padding is currently
-        // determined from the inverse order of uses in the function. I'm not
-        // sure if the use list order is in any way connected to this, so the
-        // total reported size is likely incorrect.
-        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
-        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alignment);
-        CurrentLocalMemUsage += AllocSize;
+    // For any ConstantExpr uses, we need to recursively search the users until
+    // we see a function.
+    while (!Stack.empty()) {
+      const Constant *C = Stack.pop_back_val();
+      if (visitUsers(&GV, C)) {
+        UsedLDS.insert(&GV);
+        Stack.clear();
         break;
       }
     }
   }
 
+  const DataLayout &DL = Mod->getDataLayout();
+  SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
+  AllocatedSizes.reserve(UsedLDS.size());
+
+  for (const GlobalVariable *GV : UsedLDS) {
+    Align Alignment =
+        DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
+    uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
+    AllocatedSizes.emplace_back(AllocSize, Alignment);
+  }
+
+  // Sort to try to estimate the worst case alignment padding
+  //
+  // FIXME: We should really do something to fix the addresses to a more optimal
+  // value instead
+  std::sort(AllocatedSizes.begin(), AllocatedSizes.end(),
+            [](std::pair<uint64_t, Align> LHS, std::pair<uint64_t, Align> RHS) {
+              return LHS.second < RHS.second;
+            });
+
+  // Check how much local memory is being used by global objects
+  CurrentLocalMemUsage = 0;
+
+  // FIXME: Try to account for padding here. The real padding and address is
+  // currently determined from the inverse order of uses in the function when
+  // legalizing, which could also potentially change. We try to estimate the
+  // worst case here, but we probably should fix the addresses earlier.
+  for (auto Alloc : AllocatedSizes) {
+    CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
+    CurrentLocalMemUsage += Alloc.first;
+  }
+
   unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, F);
Index: llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
+++ llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
@@ -1,10 +1,12 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=GCN %s
 
-; This shows that the amount of LDS estimate is sensitive to the order
-; of the LDS globals.
+; This shows that the LDS size estimate should try not to be
+; sensitive to the order of the LDS globals. It should try to
+; estimate the worst case padding behavior to avoid overallocating
+; LDS.
 
-; Both of these functions use the same amount of LDS, but the total
-; changes depending on the visit order of first use.
+; These functions use the same amount of LDS, but the final total
+; size changes depending on the visit order of first use.
 
 ; The one with the suboptimal order resulting in extra padding exceeds
 ; the desired limit
@@ -29,7 +31,7 @@
 
 
 ; GCN-LABEL: {{^}}promote_alloca_size_order_0:
-; GCN: workgroup_group_segment_byte_size = 2340
+; GCN: workgroup_group_segment_byte_size = 1060
 define amdgpu_kernel void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
 entry:
   %stack = alloca [5 x i32], align 4, addrspace(5)
@@ -61,7 +63,7 @@
 }
 
 ; GCN-LABEL: {{^}}promote_alloca_size_order_1:
-; GCN: workgroup_group_segment_byte_size = 2352
+; GCN: workgroup_group_segment_byte_size = 1072
 define amdgpu_kernel void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
 entry:
   %stack = alloca [5 x i32], align 4, addrspace(5)
Index: llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
+++ llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
@@ -4,16 +4,20 @@
 target datalayout = "A5"
 
 @all_lds = internal unnamed_addr addrspace(3) global [16384 x i32] undef, align 4
+@some_lds = internal unnamed_addr addrspace(3) global [32 x i32] undef, align 4
+
+@initializer_user_some = addrspace(1) global i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), align 4
+@initializer_user_all = addrspace(1) global i32 ptrtoint ([16384 x i32] addrspace(3)* @all_lds to i32), align 4
 
 ; This function cannot promote to using LDS because of the size of the
 ; constant expression use in the function, which was previously not
 ; detected.
-; IR-LABEL: @constant_expression_uses_lds(
+; IR-LABEL: @constant_expression_uses_all_lds(
 ; IR: alloca
 
-; ASM-LABEL: constant_expression_uses_lds:
-; ASM: .group_segment_fixed_size: 65536
-define amdgpu_kernel void @constant_expression_uses_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+; ASM-LABEL: constant_expression_uses_all_lds:
+; ASM: .amdhsa_group_segment_fixed_size 65536
+define amdgpu_kernel void @constant_expression_uses_all_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
 entry:
   %stack = alloca [4 x i32], align 4, addrspace(5)
   %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
@@ -32,4 +36,130 @@
   ret void
 }
 
-attributes #0 = { "amdgpu-waves-per-eu"="1,5" }
+; Has a constant expression use through a single level of constant
+; expression, but not enough LDS to block promotion
+
+; IR-LABEL: @constant_expression_uses_some_lds(
+; IR-NOT: alloca
+
+; ASM-LABEL: {{^}}constant_expression_uses_some_lds:
+; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
+define amdgpu_kernel void @constant_expression_uses_some_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+entry:
+  %stack = alloca [4 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
+  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
+  store i32 9, i32 addrspace(5)* %gep0
+  store i32 10, i32 addrspace(5)* %gep1
+  store i32 99, i32 addrspace(5)* %gep2
+  store i32 43, i32 addrspace(5)* %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
+  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
+  store i32 %load, i32 addrspace(1)* %out
+  store volatile i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), i32 addrspace(1)* undef
+  ret void
+}
+
+declare void @callee(i8*)
+
+; IR-LABEL: @constant_expression_uses_all_lds_multi_level(
+; IR: alloca
+
+; ASM-LABEL: {{^}}constant_expression_uses_all_lds_multi_level:
+; ASM: .amdhsa_group_segment_fixed_size 65536{{$}}
+define amdgpu_kernel void @constant_expression_uses_all_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+entry:
+  %stack = alloca [4 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
+  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
+  store i32 9, i32 addrspace(5)* %gep0
+  store i32 10, i32 addrspace(5)* %gep1
+  store i32 99, i32 addrspace(5)* %gep2
+  store i32 43, i32 addrspace(5)* %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
+  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
+  store i32 %load, i32 addrspace(1)* %out
+  call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([16384 x i32], [16384 x i32] addrspace(3)* @all_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*))
+  ret void
+}
+
+; IR-LABEL: @constant_expression_uses_some_lds_multi_level(
+; IR-NOT: alloca
+; IR: llvm.amdgcn.workitem.id
+
+; ASM-LABEL: {{^}}constant_expression_uses_some_lds_multi_level:
+; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
+define amdgpu_kernel void @constant_expression_uses_some_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+entry:
+  %stack = alloca [4 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
+  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
+  store i32 9, i32 addrspace(5)* %gep0
+  store i32 10, i32 addrspace(5)* %gep1
+  store i32 99, i32 addrspace(5)* %gep2
+  store i32 43, i32 addrspace(5)* %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
+  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
+  store i32 %load, i32 addrspace(1)* %out
+  call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(3)* @some_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*))
+  ret void
+}
+
+; IR-LABEL: @constant_expression_uses_some_lds_global_initializer(
+; IR-NOT: alloca
+; IR: llvm.amdgcn.workitem.id
+
+; ASM-LABEL: {{^}}constant_expression_uses_some_lds_global_initializer:
+; ASM: .amdhsa_group_segment_fixed_size 4096{{$}}
+define amdgpu_kernel void @constant_expression_uses_some_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+entry:
+  %stack = alloca [4 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
+  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
+  store i32 9, i32 addrspace(5)* %gep0
+  store i32 10, i32 addrspace(5)* %gep1
+  store i32 99, i32 addrspace(5)* %gep2
+  store i32 43, i32 addrspace(5)* %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
+  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
+  store i32 %load, i32 addrspace(1)* %out
+
+  store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_some to i32), i32 addrspace(1)* undef
+  ret void
+}
+
+; We can't actually handle LDS initializers in global initializers,
+; but this should count as usage.
+
+; IR-LABEL: @constant_expression_uses_all_lds_global_initializer(
+; IR: alloca
+
+; ASM-LABEL: {{^}}constant_expression_uses_all_lds_global_initializer:
+; ASM: .group_segment_fixed_size: 65536
+define amdgpu_kernel void @constant_expression_uses_all_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+entry:
+  %stack = alloca [4 x i32], align 4, addrspace(5)
+  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
+  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
+  store i32 9, i32 addrspace(5)* %gep0
+  store i32 10, i32 addrspace(5)* %gep1
+  store i32 99, i32 addrspace(5)* %gep2
+  store i32 43, i32 addrspace(5)* %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
+  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
+  store i32 %load, i32 addrspace(1)* %out
+  store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_all to i32), i32 addrspace(1)* undef
+  ret void
+}
+
+attributes #0 = { "amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="256,256" }