diff --git a/llvm/include/llvm/Support/Alignment.h b/llvm/include/llvm/Support/Alignment.h
--- a/llvm/include/llvm/Support/Alignment.h
+++ b/llvm/include/llvm/Support/Alignment.h
@@ -345,6 +345,10 @@
   return Rhs && *Rhs > Lhs ? *Rhs : Lhs;
 }
 
+inline Align max(Align Lhs, Align Rhs) {
+  return Lhs > Rhs ? Lhs : Rhs;
+}
+
 #ifndef NDEBUG
 // For usage in LLVM_DEBUG macros.
 inline std::string DebugStr(const Align &A) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -9,11 +9,128 @@
 #include "AMDGPUMachineFunction.h"
 #include "AMDGPUPerfHintAnalysis.h"
 #include "AMDGPUSubtarget.h"
+#include "Utils/AMDGPULDSUtils.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/Target/TargetMachine.h"
 
 using namespace llvm;
 
+// Fix the natural alignment of the LDS global `GV` based on its size.
+static Align fixAlignment(const DataLayout &DL, const GlobalVariable *GV,
+                          Align Alignment) {
+  TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType());
+
+  if (GVSize > 8) {
+    // We might want to use a b96 or b128 load/store
+    Alignment = max(Alignment, Align(16));
+  } else if (GVSize > 4) {
+    // We might want to use a b64 load/store
+    Alignment = max(Alignment, Align(8));
+  } else if (GVSize > 2) {
+    // We might want to use a b32 load/store
+    Alignment = max(Alignment, Align(4));
+  } else if (GVSize > 1) {
+    // We might want to use a b16 load/store
+    Alignment = max(Alignment, Align(2));
+  }
+
+  return Alignment;
+}
+
+// Check if the kernel K uses the LDS global GV.
+static bool hasKernelUsesLDS(const GlobalVariable *GV, const Function *K) {
+  SmallVector<const User *, 8> UserStack(GV->users());
+  SmallPtrSet<const User *, 8> VisitedUsers;
+
+  // The possible uses of GV are:
+  // 1. GV is directly used within an instruction as one of its operands.
+  // 2. GV is used within a constant expression, and this constant expression
+  //    itself is used either within an instruction or as an initializer of
+  //    a global variable. Though the latter case is rare, we still need to
+  //    check for it.
+  while (!UserStack.empty()) {
+    const auto *UU = UserStack.pop_back_val();
+
+    // No need to handle already visited users.
+    if (!VisitedUsers.insert(UU).second)
+      continue;
+
+    // Ignore rare (but possible) uses as in the below example.
+    //   @GV = addrspace(3) global int undef
+    //   @GV2 =
+    //     addrspace(1) global int* addrspacecast(int addrspace(3)* @GV to int*)
+    if (isa<GlobalValue>(UU))
+      continue;
+
+    // Recursively traverse through constant expressions.
+    if (isa<Constant>(UU)) {
+      append_range(UserStack, UU->users());
+      continue;
+    }
+
+    // At this point, the user should be an instruction. If this instruction
+    // belongs to the kernel K, return true.
+    const auto *I = dyn_cast<Instruction>(UU);
+    assert(I && "Expected an instruction");
+    if (I->getFunction() == K)
+      return true;
+  }
+
+  // GV is not used within kernel K.
+  return false;
+}
+
+static SmallVector<const GlobalVariable *, 8>
+getSortedUsedLDSGlobals(const Function *K) {
+  const Module *M = K->getParent();
+  const DataLayout &DL = M->getDataLayout();
+
+  // Collect all static LDS globals defined within the module.
+  SmallVector<const GlobalVariable *, 8> LDSGlobals =
+      AMDGPU::collectStaticLDSGlobals(M);
+
+  // Collect those static LDS globals which are *used* within kernel K.
+  SmallVector<const GlobalVariable *, 8> UsedLDSGlobals;
+  for (const auto *GV : LDSGlobals) {
+    if (hasKernelUsesLDS(GV, K))
+      UsedLDSGlobals.push_back(GV);
+  }
+
+  // Sort the LDS globals used within kernel K by alignment (descending); on
+  // ties, by size (descending); on further ties, by name (lexicographically).
+  llvm::stable_sort(
+      UsedLDSGlobals,
+      [&](const GlobalVariable *LHS, const GlobalVariable *RHS) -> bool {
+        Align ALHS = fixAlignment(DL, LHS, AMDGPU::getAlign(DL, LHS));
+        Align ARHS = fixAlignment(DL, RHS, AMDGPU::getAlign(DL, RHS));
+        if (ALHS != ARHS) {
+          return ALHS > ARHS;
+        }
+
+        TypeSize SLHS = DL.getTypeAllocSize(LHS->getValueType());
+        TypeSize SRHS = DL.getTypeAllocSize(RHS->getValueType());
+        if (SLHS != SRHS) {
+          return SLHS > SRHS;
+        }
+
+        return LHS->getName() < RHS->getName();
+      });
+
+  // The module LDS variable, possibly created by the "Lower Module LDS" pass,
+  // should be allocated at address 0, irrespective of its size and alignment.
+  SmallVector<const GlobalVariable *, 8> SortedLDSGlobals;
+  const GlobalVariable *ModuleLDS = M->getGlobalVariable("llvm.amdgcn.module.lds");
+  if (ModuleLDS)
+    SortedLDSGlobals.push_back(ModuleLDS);
+
+  for (const auto *GV : UsedLDSGlobals) {
+    if (GV != ModuleLDS)
+      SortedLDSGlobals.push_back(GV);
+  }
+
+  return SortedLDSGlobals;
+}
+
 AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF)
     : MachineFunctionInfo(), Mode(MF.getFunction()),
       IsEntryFunction(
@@ -36,6 +153,15 @@
   CallingConv::ID CC = F.getCallingConv();
   if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
     ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
+
+  if (IsModuleEntryFunction) {
+    SmallVector<const GlobalVariable *, 8> SortedLDSGlobals =
+        getSortedUsedLDSGlobals(&F);
+
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    for (const auto *GV : SortedLDSGlobals)
+      allocateLDSGlobal(DL, *GV);
+  }
 }
 
 unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
@@ -47,9 +173,9 @@
   Align Alignment =
       DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
 
-  /// TODO: We should sort these to minimize wasted space due to alignment
-  /// padding. Currently the padding is decided by the first encountered use
-  /// during lowering.
+  // Make sure that the natural alignment of `GV` is correctly fixed.
+  Alignment = fixAlignment(DL, &GV, Alignment);
+
   unsigned Offset = StaticLDSSize = alignTo(StaticLDSSize, Alignment);
 
   Entry.first->second = Offset;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -31,6 +31,8 @@
 
 SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);
 
+SmallVector<const GlobalVariable *, 8> collectStaticLDSGlobals(const Module *M);
+
 } // end namespace AMDGPU
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -29,6 +29,42 @@
                                        GV->getValueType());
 }
 
+SmallVector<const GlobalVariable *, 8>
+collectStaticLDSGlobals(const Module *M) {
+  SmallVector<const GlobalVariable *, 8> StaticLDSGlobals;
+
+  for (auto &GV : M->globals()) {
+    if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
+      continue;
+    }
+
+    if (!GV.hasInitializer()) {
+      // addrspace(3) without initializer implies cuda/hip extern __shared__
+      // the semantics for such a variable appears to be that all extern
+      // __shared__ variables alias one another, in which case this transform
+      // is not required
+      continue;
+    }
+
+    if (!isa<UndefValue>(GV.getInitializer())) {
+      // Initializers are unimplemented for local address space.
+      // Leave such variables in place for consistent error reporting.
+      continue;
+    }
+
+    if (GV.isConstant()) {
+      // A constant undef variable can't be written to, and any load is
+      // undef, so it should be eliminated by the optimizer. It could be
+      // dropped by the back end if not. This pass skips over it.
+      continue;
+    }
+
+    StaticLDSGlobals.push_back(&GV);
+  }
+
+  return StaticLDSGlobals;
+}
+
 bool userRequiresLowering(const SmallPtrSetImpl<GlobalValue *> &UsedList,
                           User *InitialUser) {
   // Any LDS variable can be lowered by moving into the created struct
@@ -75,36 +111,23 @@
 std::vector<GlobalVariable *> findVariablesToLower(Module &M,
                                 const SmallPtrSetImpl<GlobalValue *> &UsedList) {
+  SmallVector<const GlobalVariable *, 8> StaticLDSGlobals =
+      collectStaticLDSGlobals(&M);
+
   std::vector<GlobalVariable *> LocalVars;
-  for (auto &GV : M.globals()) {
-    if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
-      continue;
-    }
-    if (!GV.hasInitializer()) {
-      // addrspace(3) without initializer implies cuda/hip extern __shared__
-      // the semantics for such a variable appears to be that all extern
-      // __shared__ variables alias one another, in which case this transform
-      // is not required
-      continue;
-    }
-    if (!isa<UndefValue>(GV.getInitializer())) {
-      // Initializers are unimplemented for local address space.
-      // Leave such variables in place for consistent error reporting.
-      continue;
-    }
-    if (GV.isConstant()) {
-      // A constant undef variable can't be written to, and any load is
-      // undef, so it should be eliminated by the optimizer. It could be
-      // dropped by the back end if not. This pass skips over it.
-      continue;
-    }
-    if (std::none_of(GV.user_begin(), GV.user_end(), [&](User *U) {
-          return userRequiresLowering(UsedList, U);
-        })) {
-      continue;
-    }
-    LocalVars.push_back(&GV);
-  }
+
+  for (auto *GV : StaticLDSGlobals) {
+    GlobalVariable *GV2 = const_cast<GlobalVariable *>(GV);
+
+    if (std::none_of(GV2->user_begin(), GV2->user_end(), [&](User *U) {
+          return userRequiresLowering(UsedList, U);
+        })) {
+      continue;
+    }
+
+    LocalVars.push_back(GV2);
+  }
+
   return LocalVars;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
@@ -11,7 +11,7 @@
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 4
 ; CHECK-NEXT:    s_mov_b32 m0, -1
-; CHECK-NEXT:    ds_read_b32 v3, v0 offset:4
+; CHECK-NEXT:    ds_read_b32 v3, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 9
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_add_u32 s0, s0, 4
@@ -19,7 +19,7 @@
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-NEXT:    flat_store_dword v[0:1], v3
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x200
 ; CHECK-NEXT:    ds_write_b32 v0, v2
 ; CHECK-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
--- a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
@@ -1,5 +1,11 @@
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 < %s | FileCheck -check-prefix=HSA %s
 
+; LDS allocation is now done in the sorted order:
+; 1. First, alignment is fixed, even for missing alignment or underaligned cases since we know the
+;    size of LDS globals at compile time.
+; 2. Next, per kernel used LDS globals are sorted, based on natural alignment, on ties, based on size.
+; 3. Finally, memory is allocated in the above sorted order.
+
 @lds.align16.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16
 @lds.align16.1 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16
 
@@ -25,7 +31,7 @@
 ; There are two objects, so one requires padding to be correctly
 ; aligned after the other.
-; (38 -> 48) + 38 = 92
+; 38 + (10 pad) + 38 = 86
 
 ; I don't think it is necessary to add padding after since if there
 ; were to be a dynamically sized LDS kernel arg, the runtime should
@@ -94,9 +100,9 @@
   ret void
 }
 
-; (7 * 8) + (39 * 4) = 212
+; 156 + (4 pad) + 56 = 216
 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
-; HSA: workgroup_group_segment_byte_size = 212
+; HSA: workgroup_group_segment_byte_size = 216
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
@@ -110,7 +116,7 @@
   ret void
 }
 
-; (39 * 4) + (4 pad) + (7 * 8) = 216
+; 156 + (4 pad) + 56 = 216
 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1:
 ; HSA: workgroup_group_segment_byte_size = 216
 ; HSA: group_segment_alignment = 4
@@ -158,7 +164,7 @@
   ret void
 }
 
-; align 32, 8, 16
-; 38 (+ 2 pad) + 38 + (18 pad) + 38 = 134
+; align 32, 16, 8
+; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
 ; HSA-LABEL: {{^}}test_round_size_3_order1:
 ; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
@@ -179,10 +185,10 @@
   ret void
 }
 
-; align 16, 32, 8
-; 38 + (26 pad) + 38 + (10 pad) + 38 = 150
+; align 32, 16, 8
+; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
 ; HSA-LABEL: {{^}}test_round_size_3_order2:
-; HSA: workgroup_group_segment_byte_size = 150
+; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -200,10 +206,10 @@
   ret void
 }
 
-; align 16, 8, 32
-; 38 + (2 pad) + 38 + (2 pad) + 38
+; align 32, 16, 8
+; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
 ; HSA-LABEL: {{^}}test_round_size_3_order3:
-; HSA: workgroup_group_segment_byte_size = 118
+; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -221,10 +227,10 @@
   ret void
 }
 
-; align 8, 32, 16
-; 38 + (26 pad) + 38 + (2 pad) + 38 = 142
+; align 32, 16, 8
+; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
 ; HSA-LABEL: {{^}}test_round_size_3_order4:
-; HSA: workgroup_group_segment_byte_size = 142
+; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
@@ -242,10 +248,10 @@
   ret void
 }
 
-; align 8, 16, 32
-; 38 + (10 pad) + 38 + (2 pad) + 38 = 126
+; align 32, 16, 8
+; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
 ; HSA-LABEL: {{^}}test_round_size_3_order5:
-; HSA: workgroup_group_segment_byte_size = 126
+; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
diff --git a/llvm/test/CodeGen/AMDGPU/lds-allocation.ll b/llvm/test/CodeGen/AMDGPU/lds-allocation.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-allocation.ll
@@ -0,0 +1,251 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=4 < %s | FileCheck -check-prefix=HSA %s
+
+; LDS allocation is now done in the sorted order as below:
+;
+; 1. 
First, alignment is fixed, even for missing alignment or underaligned cases since +; we know the size of LDS globals at compile time. +; 2. Next, per kernel used LDS globals are sorted, based on natural alignment, on ties, +; based on size. +; 3. Finally, memory is allocated in the above sorted order. + + +; [TEST 0] +; +; No alignment fix is required for allocation as all the LDS globals are on required alignment +; boundary. That is: +; @lds.size.16.align.16 -> 16 byte boundary +; @lds.size.8.align.8 -> 8 byte boundary +; @lds.size.4.align.4 -> 4 byte boundary +; @lds.size.2.align.2 -> 2 byte boundary +; @lds.size.1.align.1 -> 1 byte boundary (any boundary) +; +; Sorted order based on alignment should be: +; [@lds.size.16.align.16, @lds.size.8.align.8, @lds.size.4.align.4, @lds.size.2.align.2, @lds.size.1.align.1] +; +; Memory allocated in the sorted order should be: +; 16 + 8 + 4 + 2 + 1 [= 31 bytes] + +@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1 +@lds.size.2.align.2 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 2 +@lds.size.4.align.4 = internal unnamed_addr addrspace(3) global [4 x i8] undef, align 4 +@lds.size.8.align.8 = internal unnamed_addr addrspace(3) global [8 x i8] undef, align 8 +@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16 + +; HSA-LABEL: {{^}}fix_alignment_0: +; HSA: amdhsa_group_segment_fixed_size 31 +define amdgpu_kernel void @fix_alignment_0() { + %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1 + + %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.2 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2 + + %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @lds.size.4.align.4 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4 + + %lds.size.8.align.8.bc = bitcast [8 x i8] addrspace(3)* @lds.size.8.align.8 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.8.align.8.bc, align 8 + + %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16 + + ret void +} + + +; [TEST 1] +; +; Fixed alignments for allocation should be: +; @lds.size.9.align.8 -> 16 byte boundary +; @lds.size.5.align.4 -> 8 byte boundary +; @lds.size.3.align.2 -> 4 byte boundary +; @lds.size.2.align.1 -> 2 byte boundary +; +; Sorted order based on *fixed* alignment should be: +; [@lds.size.9.align.8, lds.size.5.align.4, lds.size.3.align.2, lds.size.2.align.1] +; +; Memory allocated in the sorted order should be: +; 9 + (pad 7) + 5 + (pad 3) + 3 + (pad 1) + 2 [= 30 bytes] + +@lds.size.2.align.1 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1 +@lds.size.3.align.2 = internal unnamed_addr addrspace(3) global [3 x i8] undef, align 2 +@lds.size.5.align.4 = internal unnamed_addr addrspace(3) global [5 x i8] undef, align 4 +@lds.size.9.align.8 = internal unnamed_addr addrspace(3) global [9 x i8] undef, align 8 + +; HSA-LABEL: {{^}}fix_alignment_1: +; HSA: amdhsa_group_segment_fixed_size 30 +define amdgpu_kernel void @fix_alignment_1() { + %lds.size.2.align.1.bc = bitcast [2 x i8] addrspace(3)* @lds.size.2.align.1 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.2.align.1.bc, align 1 + + %lds.size.3.align.2.bc = bitcast [3 x i8] addrspace(3)* @lds.size.3.align.2 to 
i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.3.align.2.bc, align 2 + + %lds.size.5.align.4.bc = bitcast [5 x i8] addrspace(3)* @lds.size.5.align.4 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.5.align.4.bc, align 4 + + %lds.size.9.align.8.bc = bitcast [9 x i8] addrspace(3)* @lds.size.9.align.8 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.9.align.8.bc, align 8 + + ret void +} + + +; [TEST 2] +; +; Fixed alignments for allocation should be: +; @lds.size.9.align.1 -> 16 byte boundary +; @lds.size.10.align.2 -> 16 byte boundary +; @lds.size.11.align.4 -> 16 byte boundary +; @lds.size.12.align.8 -> 16 byte boundary +; +; Sorted order based on *fixed* alignment and then by size should be: +; [@lds.size.12.align.8, @lds.size.11.align.4, @lds.size.10.align.2, @lds.size.9.align.1] +; +; Memory allocated in the sorted order should be: +; 12 + (pad 4) + 11 + (pad 5) + 10 + (pad 6) + 9 [= 57 bytes] + +@lds.size.9.align.1 = internal unnamed_addr addrspace(3) global [9 x i8] undef, align 1 +@lds.size.10.align.2 = internal unnamed_addr addrspace(3) global [10 x i8] undef, align 2 +@lds.size.11.align.4 = internal unnamed_addr addrspace(3) global [11 x i8] undef, align 4 +@lds.size.12.align.8 = internal unnamed_addr addrspace(3) global [12 x i8] undef, align 8 + +; HSA-LABEL: {{^}}fix_alignment_2: +; HSA: amdhsa_group_segment_fixed_size 57 +define amdgpu_kernel void @fix_alignment_2() { + %lds.size.9.align.1.bc = bitcast [9 x i8] addrspace(3)* @lds.size.9.align.1 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.9.align.1.bc, align 1 + + %lds.size.10.align.2.bc = bitcast [10 x i8] addrspace(3)* @lds.size.10.align.2 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.10.align.2.bc, align 2 + + %lds.size.11.align.4.bc = bitcast [11 x i8] addrspace(3)* @lds.size.11.align.4 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.11.align.4.bc, align 4 + + %lds.size.12.align.8.bc = bitcast [12 x i8] addrspace(3)* @lds.size.12.align.8 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.12.align.8.bc, align 8 + + ret void +} + + +; [TEST 3] +; +; Fixed alignments for allocation should be: +; @lds.size.5.align.2 -> 8 byte boundary +; @lds.size.6.align.2 -> 8 byte boundary +; @lds.size.7.align.2 -> 8 byte boundary +; @lds.size.7.align.4 -> 8 byte boundary +; +; Sorted order based on *fixed* alignment and then by size should be: +; [@lds.size.7.align.4, @lds.size.7.align.2, @lds.size.6.align.2, @lds.size.5.align.2] +; OR +; [@lds.size.7.align.2, @lds.size.7.align.4, @lds.size.6.align.2, @lds.size.5.align.2] +; +; Memory allocated in the sorted order should be: +; 7 + (pad 1) + 7 + (pad 1) + 6 + (pad 2) + 5 [= 29 bytes] + +@lds.size.5.align.2 = internal unnamed_addr addrspace(3) global [5 x i8] undef, align 2 +@lds.size.6.align.2 = internal unnamed_addr addrspace(3) global [6 x i8] undef, align 2 +@lds.size.7.align.2 = internal unnamed_addr addrspace(3) global [7 x i8] undef, align 2 +@lds.size.7.align.4 = internal unnamed_addr addrspace(3) global [7 x i8] undef, align 4 + +; HSA-LABEL: {{^}}fix_alignment_3: +; HSA: amdhsa_group_segment_fixed_size 29 +define amdgpu_kernel void @fix_alignment_3() { + %lds.size.5.align.2.bc = bitcast [5 x i8] addrspace(3)* @lds.size.5.align.2 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.5.align.2.bc, align 2 + + %lds.size.6.align.2.bc = bitcast [6 x i8] addrspace(3)* @lds.size.6.align.2 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.6.align.2.bc, align 2 + + 
%lds.size.7.align.2.bc = bitcast [7 x i8] addrspace(3)* @lds.size.7.align.2 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.7.align.2.bc, align 2 + + %lds.size.7.align.4.bc = bitcast [7 x i8] addrspace(3)* @lds.size.7.align.4 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.7.align.4.bc, align 4 + + ret void +} + + +; [TEST 4] +; +; No alignment fix is required for allocation as all the LDS globals are on required alignment +; boundary. That is: +; @lds.size.17.align.16 -> 16 byte boundary +; @lds.size.18.align.16 -> 16 byte boundary +; @lds.size.19.align.16 -> 16 byte boundary +; @lds.size.20.align.16 -> 16 byte boundary +; +; Sorted order based on *size* should be: +; [@lds.size.20.align.16, @lds.size.19.align.16, @lds.size.18.align.16, @lds.size.17.align.16] +; +; Memory allocated in the sorted order should be: +; 20 + (pad 12) + 19 + (pad 13) + 18 + (pad 14) + 17 [= 113 bytes] + +@lds.size.17.align.16 = internal unnamed_addr addrspace(3) global [17 x i8] undef, align 16 +@lds.size.18.align.16 = internal unnamed_addr addrspace(3) global [18 x i8] undef, align 16 +@lds.size.19.align.16 = internal unnamed_addr addrspace(3) global [19 x i8] undef, align 16 +@lds.size.20.align.16 = internal unnamed_addr addrspace(3) global [20 x i8] undef, align 16 + +; HSA-LABEL: {{^}}fix_alignment_4: +; HSA: amdhsa_group_segment_fixed_size 113 +define amdgpu_kernel void @fix_alignment_4() { + %lds.size.17.align.16.bc = bitcast [17 x i8] addrspace(3)* @lds.size.17.align.16 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.17.align.16.bc, align 16 + + %lds.size.18.align.16.bc = bitcast [18 x i8] addrspace(3)* @lds.size.18.align.16 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.18.align.16.bc, align 16 + + %lds.size.19.align.16.bc = bitcast [19 x i8] addrspace(3)* @lds.size.19.align.16 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.19.align.16.bc, align 16 + + %lds.size.20.align.16.bc = bitcast [20 x i8] addrspace(3)* @lds.size.20.align.16 to i8 addrspace(3)* + store i8 0, i8 addrspace(3)* %lds.size.20.align.16.bc, align 16 + + ret void +} + +; [TEST 5] +; +; No alignment fix is required for allocation as all the LDS globals are on required alignment +; boundary. 
That is:
+;   @size.1.align.1  -> any boundary
+;   @size.1.align.2  -> any boundary
+;   @size.1.align.4  -> any boundary
+;   @size.1.align.8  -> any boundary
+;   @size.1.align.16 -> any boundary
+;
+; Sorted order based on *alignment* should be:
+;   [@size.1.align.16, @size.1.align.8, @size.1.align.4, @size.1.align.2, @size.1.align.1]
+;
+; Memory allocated in the sorted order should be:
+;   1 + (pad 7) + 1 + (pad 3) + 1 + (pad 1) + 1 + 1 [= 16 bytes]
+
+@size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
+@size.1.align.2 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 2
+@size.1.align.4 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 4
+@size.1.align.8 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 8
+@size.1.align.16 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 16
+
+; HSA-LABEL: {{^}}fix_alignment_5:
+; HSA: amdhsa_group_segment_fixed_size 16
+define amdgpu_kernel void @fix_alignment_5() {
+  %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @size.1.align.1 to i8 addrspace(3)*
+  store i8 0, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
+
+  %lds.size.1.align.2.bc = bitcast [1 x i8] addrspace(3)* @size.1.align.2 to i8 addrspace(3)*
+  store i8 0, i8 addrspace(3)* %lds.size.1.align.2.bc, align 2
+
+  %lds.size.1.align.4.bc = bitcast [1 x i8] addrspace(3)* @size.1.align.4 to i8 addrspace(3)*
+  store i8 0, i8 addrspace(3)* %lds.size.1.align.4.bc, align 4
+
+  %lds.size.1.align.8.bc = bitcast [1 x i8] addrspace(3)* @size.1.align.8 to i8 addrspace(3)*
+  store i8 0, i8 addrspace(3)* %lds.size.1.align.8.bc, align 8
+
+  %lds.size.1.align.16.bc = bitcast [1 x i8] addrspace(3)* @size.1.align.16 to i8 addrspace(3)*
+  store i8 0, i8 addrspace(3)* %lds.size.1.align.16.bc, align 16
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
--- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -41,11 +41,11 @@
 ; CI-DAG: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
 ; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
 ; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
-; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
+; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 12, [[ADDRW]]
 ; GCN: s_barrier
-; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
+; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 28, [[ADDRW]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
@@ -11,13 +11,17 @@
 ; The one with the suboptimal order resulting in extra padding exceeds
 ; the desired limit
 
-; The padding estimate heuristic used by the promote alloca pass
-; is mostly determined by the order of the globals,
-
-; Raw usage = 1060 bytes
-; Rounded usage:
-; 292 + (4 pad) + 256 + (8 pad) + 512 = 1072
-; 512 + (0 pad) + 256 + (0 pad) + 292 = 1060
+; The padding estimate heuristic used by the promote alloca pass is mostly
+; determined based on the following allocation order:
+;
+; 1. 
First, alignment is fixed, even for missing alignment or underaligned +; cases since we know the size of LDS globals at compile time. +; 2. Next, per kernel used LDS globals are sorted, based on natural alignment, +; on ties, based on size. +; 3. Finally, memory is allocated in the above sorted order. + +; Raw usage: 512 + 292 + 256 = 1060 bytes +; Rounded usage: 512 + (0 pad) + 292 + (12 pad) + 256 = 1072 bytes ; At default occupancy guess of 7, 2340 bytes available total. @@ -31,7 +35,7 @@ ; GCN-LABEL: {{^}}promote_alloca_size_order_0: -; GCN: workgroup_group_segment_byte_size = 1060 +; GCN: workgroup_group_segment_byte_size = 1072 define amdgpu_kernel void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { entry: %stack = alloca [5 x i32], align 4, addrspace(5)