diff --git a/llvm/include/llvm/Support/Alignment.h b/llvm/include/llvm/Support/Alignment.h
--- a/llvm/include/llvm/Support/Alignment.h
+++ b/llvm/include/llvm/Support/Alignment.h
@@ -345,6 +345,8 @@
   return Rhs && *Rhs > Lhs ? *Rhs : Lhs;
 }
 
+inline Align max(Align Lhs, Align Rhs) { return Lhs > Rhs ? Lhs : Rhs; }
+
 #ifndef NDEBUG
 // For usage in LLVM_DEBUG macros.
 inline std::string DebugStr(const Align &A) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -9,11 +9,115 @@
 #include "AMDGPUMachineFunction.h"
 #include "AMDGPUPerfHintAnalysis.h"
 #include "AMDGPUSubtarget.h"
+#include "Utils/AMDGPULDSUtils.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/Target/TargetMachine.h"
 
 using namespace llvm;
 
+// Fix the natural alignment of the LDS global `GV` based on its size.
+static Align fixAlignment(const DataLayout &DL, const GlobalVariable *GV,
+                          Align Alignment) {
+  TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType());
+
+  if (GVSize > 8) {
+    // We might want to use a b96 or b128 load/store
+    Alignment = max(Alignment, Align(16));
+  } else if (GVSize > 4) {
+    // We might want to use a b64 load/store
+    Alignment = max(Alignment, Align(8));
+  } else if (GVSize > 2) {
+    // We might want to use a b32 load/store
+    Alignment = max(Alignment, Align(4));
+  } else if (GVSize > 1) {
+    // We might want to use a b16 load/store
+    Alignment = max(Alignment, Align(2));
+  }
+
+  return Alignment;
+}
+
+// Collect all kernel functions within which `GV` is used.
+static std::set<const Function *>
+collectUsedKernels(const GlobalVariable *GV) {
+  std::set<const Function *> UsedKernels;
+  SmallVector<const User *> UserStack(GV->users());
+  SmallPtrSet<const User *, 8> VisitedUsers;
+
+  while (!UserStack.empty()) {
+    const auto *UU = UserStack.pop_back_val();
+
+    if (!VisitedUsers.insert(UU).second)
+      continue;
+
+    if (isa<GlobalValue>(UU))
+      continue;
+
+    if (isa<Constant>(UU)) {
+      append_range(UserStack, UU->users());
+      continue;
+    }
+
+    const auto *I = dyn_cast<Instruction>(UU);
+    assert(I && "Expected an instruction");
+    const Function *F = I->getFunction();
+    if (AMDGPU::isModuleEntryFunctionCC(F->getCallingConv()))
+      UsedKernels.insert(F);
+  }
+
+  return UsedKernels;
+}
+
+static std::vector<const GlobalVariable *>
+getSortedUsedLDSGlobals(const Function *K) {
+  const Module *M = K->getParent();
+  const DataLayout &DL = M->getDataLayout();
+
+  // Collect all static LDS globals defined within the module.
+  std::vector<const GlobalVariable *> LDSGlobals =
+      AMDGPU::collectStaticLDSGlobals(M);
+
+  // Collect those static LDS globals which are *used* within kernel K.
+  std::vector<const GlobalVariable *> UsedLDSGlobals;
+  for (const auto *GV : LDSGlobals) {
+    if (llvm::is_contained(collectUsedKernels(GV), K))
+      UsedLDSGlobals.push_back(GV);
+  }
+
+  // Sort the LDS globals used within kernel K by alignment (descending); on
+  // ties, by size (descending); on further ties, by name (lexicographically).
+  llvm::stable_sort(
+      UsedLDSGlobals,
+      [&](const GlobalVariable *LHS, const GlobalVariable *RHS) -> bool {
+        Align ALHS = fixAlignment(DL, LHS, AMDGPU::getAlign(DL, LHS));
+        Align ARHS = fixAlignment(DL, RHS, AMDGPU::getAlign(DL, RHS));
+        if (ALHS != ARHS) {
+          return ALHS > ARHS;
+        }
+
+        TypeSize SLHS = DL.getTypeAllocSize(LHS->getValueType());
+        TypeSize SRHS = DL.getTypeAllocSize(RHS->getValueType());
+        if (SLHS != SRHS) {
+          return SLHS > SRHS;
+        }
+
+        return LHS->getName() < RHS->getName();
+      });
+
+  // Module LDS, possibly created by the "Lower Module LDS" pass, should be
+  // allocated at address 0, irrespective of its size and alignment.
+  std::vector<const GlobalVariable *> SortedLDSGlobals;
+  const GlobalVariable *ModuleLDS =
+      M->getGlobalVariable("llvm.amdgcn.module.lds");
+  if (ModuleLDS)
+    SortedLDSGlobals.push_back(ModuleLDS);
+
+  for (const auto *GV : UsedLDSGlobals) {
+    if (GV != ModuleLDS)
+      SortedLDSGlobals.push_back(GV);
+  }
+
+  return SortedLDSGlobals;
+}
+
 AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF)
     : MachineFunctionInfo(), Mode(MF.getFunction()),
       IsEntryFunction(
@@ -36,6 +140,14 @@
   CallingConv::ID CC = F.getCallingConv();
   if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
     ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
+
+  if (IsModuleEntryFunction) {
+    std::vector<const GlobalVariable *> SortedLDSGlobals =
+        getSortedUsedLDSGlobals(&F);
+
+    for (const auto *GV : SortedLDSGlobals)
+      allocateLDSGlobal(F.getParent()->getDataLayout(), *GV);
+  }
 }
 
 unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
@@ -47,6 +159,9 @@
   Align Alignment =
       DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
 
+  // Make sure that the natural alignment of `GV` is correctly fixed.
+  Alignment = fixAlignment(DL, &GV, Alignment);
+
   /// TODO: We should sort these to minimize wasted space due to alignment
   /// padding. Currently the padding is decided by the first encountered use
   /// during lowering.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h
@@ -31,6 +31,8 @@
 
 SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);
 
+std::vector<const GlobalVariable *> collectStaticLDSGlobals(const Module *M);
+
 } // end namespace AMDGPU
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp
@@ -122,6 +122,51 @@
   return UsedList;
 }
 
+std::vector<const GlobalVariable *> collectStaticLDSGlobals(const Module *M) {
+  std::vector<const GlobalVariable *> StaticLDSGlobals;
+
+  for (const auto &GV : M->globals()) {
+    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
+      // Skip non-LDS globals.
+      continue;
+    }
+
+    if (GV.hasExternalLinkage()) {
+      // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
+      // zero-sized type in other languages, to declare a dynamic shared
+      // memory variable whose size is not known at compile time. Such
+      // variables are allocated by the runtime and placed directly after the
+      // statically allocated ones, so skip them here.
+      assert(GV.getValueType()->isEmptyTy() &&
+             "Expected dynamic shared memory variable type to be empty");
+      continue;
+    }
+
+    if (!GV.hasInitializer()) {
+      // Static LDS globals should have "UndefValue" as an initializer. Skip
+      // LDS globals which do not have an initializer. The ISel pass will
+      // catch such static LDS globals and report an error accordingly.
+      continue;
+    }
+
+    if (!isa<UndefValue>(GV.getInitializer())) {
+      // Static LDS globals should have "UndefValue" as an initializer. Skip
+      // LDS globals which have an initializer that is not "UndefValue". The
+      // ISel pass will catch such static LDS globals and report an error
+      // accordingly.
+      continue;
+    }
+
+    if (GV.isConstant()) {
+      // A constant LDS global cannot be allocated. Skip it.
+      continue;
+    }
+
+    StaticLDSGlobals.push_back(&GV);
+  }
+
+  return StaticLDSGlobals;
+}
+
 } // end namespace AMDGPU
 
 } // end namespace llvm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
@@ -11,7 +11,7 @@
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 4
 ; CHECK-NEXT:    s_mov_b32 m0, -1
-; CHECK-NEXT:    ds_read_b32 v3, v0 offset:4
+; CHECK-NEXT:    ds_read_b32 v3, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 9
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_add_u32 s0, s0, 4
@@ -19,7 +19,7 @@
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-NEXT:    flat_store_dword v[0:1], v3
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x200
 ; CHECK-NEXT:    ds_write_b32 v0, v2
 ; CHECK-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
--- a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll
@@ -96,7 +96,7 @@
 
 ; (7 * 8) + (39 * 4) = 212
 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
-; HSA: workgroup_group_segment_byte_size = 212
+; HSA: workgroup_group_segment_byte_size = 216
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
@@ -182,7 +182,7 @@
 ; align 16, 32, 8
 ; 38 + (26 pad) + 38 + (10 pad) + 38 = 150
 ; HSA-LABEL: {{^}}test_round_size_3_order2:
-; HSA: workgroup_group_segment_byte_size = 150
+; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -203,7 +203,7 @@
 ; align 16, 8, 32
 ; 38 + (2 pad) + 38 + (2 pad) + 38
 ; HSA-LABEL: {{^}}test_round_size_3_order3:
-; HSA: workgroup_group_segment_byte_size = 118
+; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -224,7 +224,7 @@
 ; align 8, 32, 16
 ; 38 + (26 pad) + 38 + (2 pad) + 38 = 142
 ; HSA-LABEL: {{^}}test_round_size_3_order4:
-; HSA: workgroup_group_segment_byte_size = 142
+; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
@@ -245,7 +245,7 @@
 ; align 8, 16, 32
 ; 38 + (10 pad) + 38 + (2 pad) + 38 = 126
 ; HSA-LABEL: {{^}}test_round_size_3_order5:
-; HSA: workgroup_group_segment_byte_size = 126
+; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
 define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
--- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -41,11 +41,11 @@
 ; CI-DAG: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
 ; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
 ; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
-; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
+; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 12, [[ADDRW]]
 
 ; GCN: s_barrier
 
-; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
+; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 28, [[ADDRW]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
@@ -31,7 +31,7 @@
 
 
 ; GCN-LABEL: {{^}}promote_alloca_size_order_0:
-; GCN: workgroup_group_segment_byte_size = 1060
+; GCN: workgroup_group_segment_byte_size = 1072
 define amdgpu_kernel void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
 entry:
   %stack = alloca [5 x i32], align 4, addrspace(5)
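
The effect of the new allocation order can be reproduced with a small standalone C++ sketch (toy types and hand-copied sizes, not the LLVM data structures). It mirrors the comparator used in getSortedUsedLDSGlobals and an align-then-bump allocation loop that is assumed to approximate what allocateLDSGlobal does, applied to the three [38 x i8] globals from lds-alignment.ll with their fixed alignments:

// Standalone sketch (toy types, not the LLVM classes) of the ordering and
// allocation scheme introduced by this patch.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct LDSVar {
  std::string Name;
  uint64_t Size;      // allocation size in bytes
  uint64_t Alignment; // power-of-two alignment in bytes, post-fixAlignment
};

// Round Offset up to the next multiple of a power-of-two Alignment.
static uint64_t alignTo(uint64_t Offset, uint64_t Alignment) {
  return (Offset + Alignment - 1) & ~(Alignment - 1);
}

int main() {
  // The three [38 x i8] globals from lds-alignment.ll; @lds.align8.0 is bumped
  // from align 8 to align 16 by fixAlignment because its size exceeds 8 bytes.
  std::vector<LDSVar> Vars = {
      {"lds.align8.0", 38, 16},
      {"lds.align32.0", 38, 32},
      {"lds.align16.0", 38, 16},
  };

  // Same comparator shape as the stable_sort in getSortedUsedLDSGlobals:
  // alignment descending, then size descending, then name ascending.
  std::stable_sort(Vars.begin(), Vars.end(),
                   [](const LDSVar &LHS, const LDSVar &RHS) {
                     if (LHS.Alignment != RHS.Alignment)
                       return LHS.Alignment > RHS.Alignment;
                     if (LHS.Size != RHS.Size)
                       return LHS.Size > RHS.Size;
                     return LHS.Name < RHS.Name;
                   });

  // Allocate in sorted order; the running offset at the end matches the
  // workgroup_group_segment_byte_size values checked in the tests above.
  uint64_t Offset = 0;
  for (const LDSVar &V : Vars) {
    Offset = alignTo(Offset, V.Alignment);
    std::cout << V.Name << " -> offset " << Offset << "\n";
    Offset += V.Size;
  }
  std::cout << "segment size = " << Offset << " bytes\n"; // prints 134
  return 0;
}

Because the comparator is total and deterministic (name is the final tie-break), the layout no longer depends on the order in which uses are encountered during lowering, which is why the test_round_size_3_order* expectations above all collapse to the same 134 bytes.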